 474
-475
-476
-477
-    self, structured_output: "OutlinesStructuredOutputType"
-) -> Union[Callable, None]:
-    """Creates the appropriate function to filter tokens to generate structured outputs.
-
-    Args:
-        structured_output: the configuration dict to prepare the structured output.
+ | def _prepare_structured_output(
+494
+495
+496
+497
 | def _prepare_structured_output(
-    Returns:
-        The callable that will be used to guide the generation of the model.
-    """
-    from distilabel.steps.tasks.structured_outputs.outlines import (
-        prepare_guided_output,
-    )
-
-    assert structured_output is not None, "`structured_output` cannot be `None`"
-
-    result = prepare_guided_output(structured_output, "vllm", self._model)
-    if (schema := result.get("schema")) and self.structured_output:
-        self.structured_output["schema"] = schema
-    return result["processor"]
+    self, structured_output: "OutlinesStructuredOutputType"
+) -> Union[Callable, None]:
+    """Creates the appropriate function to filter tokens to generate structured outputs.
+    Args:
+        structured_output: the configuration dict to prepare the structured output.
+
+    Returns:
+        The callable that will be used to guide the generation of the model.
+    """
+    from distilabel.steps.tasks.structured_outputs.outlines import (
+        prepare_guided_output,
+    )
+
+    assert structured_output is not None, "`structured_output` cannot be `None`"
+
+    result = prepare_guided_output(structured_output, "vllm", self._model)
+    if (schema := result.get("schema")) and self.structured_output:
+        self.structured_output["schema"] = schema
+    return result["processor"]
|
diff --git a/dev/search/search_index.json b/dev/search/search_index.json
index 8b7985888..c02ccc875 100644
--- a/dev/search/search_index.json
+++ b/dev/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Distilabel","text":"Synthesize data for AI and add feedback on the fly! Distilabel is the framework for synthetic data and AI feedback for engineers who need fast, reliable and scalable pipelines based on verified research papers. -
Get started in 5 minutes! Install distilabel with pip and run your first Pipeline to generate and evaluate synthetic data. Quickstart -
How-to guides Get familiar with the basics of distilabel. Learn how to define steps , tasks and llms and run your Pipeline . Learn more "},{"location":"#why-use-distilabel","title":"Why use distilabel?","text":"Distilabel can be used for generating synthetic data and AI feedback for a wide variety of projects including traditional predictive NLP (classification, extraction, etc.), or generative and large language model scenarios (instruction following, dialogue generation, judging etc.). Distilabel's programmatic approach allows you to build scalable pipelines for data generation and AI feedback. The goal of distilabel is to accelerate your AI development by quickly generating high-quality, diverse datasets based on verified research methodologies for generating and judging with AI feedback. Improve your AI output quality through data quality Compute is expensive and output quality is important. We help you focus on data quality, which tackles the root cause of both of these problems at once. Distilabel helps you to synthesize and judge data to let you spend your valuable time achieving and keeping high-quality standards for your synthetic data. Take control of your data and models Ownership of data for fine-tuning your own LLMs is not easy but distilabel can help you to get started. We integrate AI feedback from any LLM provider out there using one unified API. Improve efficiency by quickly iterating on the right data and models Synthesize and judge data with latest research papers while ensuring flexibility, scalability and fault tolerance. So you can focus on improving your data and training your models. "},{"location":"#what-do-people-build-with-distilabel","title":"What do people build with distilabel?","text":"The Argilla community uses distilabel to create amazing datasets and models. - The 1M OpenHermesPreference is a dataset of ~1 million AI preferences derived from teknium/OpenHermes-2.5. It shows how we can use Distilabel to synthesize data on an immense scale.
- Our distilabeled Intel Orca DPO dataset and the improved OpenHermes model, show how we improve model performance by filtering out 50% of the original dataset through AI feedback.
- The haiku DPO data outlines how anyone can create a dataset for a specific task and the latest research papers to improve the quality of the dataset.
"},{"location":"api/cli/","title":"Command Line Interface (CLI)","text":"This section contains the API reference for the CLI. For more information on how to use the CLI, see Tutorial - CLI. "},{"location":"api/cli/#utility-functions-for-the-distilabel-pipeline-sub-commands","title":"Utility functions for the distilabel pipeline sub-commands","text":"Here are some utility functions to help working with the pipelines in the console. "},{"location":"api/cli/#distilabel.cli.pipeline.utils","title":"utils ","text":""},{"location":"api/cli/#distilabel.cli.pipeline.utils.parse_runtime_parameters","title":"parse_runtime_parameters(params) ","text":"Parses the runtime parameters from the CLI format to the format expected by the Pipeline.run method. The CLI format is a list of tuples, where the first element is a list of keys and the second element is the value. Parameters: Name Type Description Default params List[Tuple[List[str], str]] A list of tuples, where the first element is a list of keys and the second element is the value. required Returns: Type Description Dict[str, Dict[str, Any]] A dictionary with the runtime parameters in the format expected by the Dict[str, Dict[str, Any]] Pipeline.run method. Source code in src/distilabel/cli/pipeline/utils.py def parse_runtime_parameters(\n params: List[Tuple[List[str], str]],\n) -> Dict[str, Dict[str, Any]]:\n \"\"\"Parses the runtime parameters from the CLI format to the format expected by the\n `Pipeline.run` method. The CLI format is a list of tuples, where the first element is\n a list of keys and the second element is the value.\n\n Args:\n params: A list of tuples, where the first element is a list of keys and the\n second element is the value.\n\n Returns:\n A dictionary with the runtime parameters in the format expected by the\n `Pipeline.run` method.\n \"\"\"\n runtime_params = {}\n for keys, value in params:\n current = runtime_params\n for i, key in enumerate(keys):\n if i == len(keys) - 1:\n current[key] = value\n else:\n current = current.setdefault(key, {})\n return runtime_params\n "},{"location":"api/cli/#distilabel.cli.pipeline.utils.valid_http_url","title":"valid_http_url(url) ","text":"Check if the URL is a valid HTTP URL. Parameters: Name Type Description Default url str The URL to check. required Returns: Type Description bool True , if the URL is a valid HTTP URL. False , otherwise. Source code in src/distilabel/cli/pipeline/utils.py def valid_http_url(url: str) -> bool:\n \"\"\"Check if the URL is a valid HTTP URL.\n\n Args:\n url: The URL to check.\n\n Returns:\n `True`, if the URL is a valid HTTP URL. `False`, otherwise.\n \"\"\"\n try:\n TypeAdapter(HttpUrl).validate_python(url) # type: ignore\n except ValidationError:\n return False\n\n return True\n "},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_config_from_url","title":"get_config_from_url(url) ","text":"Loads the pipeline configuration from a URL pointing to a JSON or YAML file. Parameters: Name Type Description Default url str The URL pointing to the pipeline configuration file. required Returns: Type Description Dict[str, Any] The pipeline configuration as a dictionary. Raises: Type Description ValueError If the file format is not supported. 
Source code in src/distilabel/cli/pipeline/utils.py def get_config_from_url(url: str) -> Dict[str, Any]:\n \"\"\"Loads the pipeline configuration from a URL pointing to a JSON or YAML file.\n\n Args:\n url: The URL pointing to the pipeline configuration file.\n\n Returns:\n The pipeline configuration as a dictionary.\n\n Raises:\n ValueError: If the file format is not supported.\n \"\"\"\n if not url.endswith((\".json\", \".yaml\", \".yml\")):\n raise DistilabelUserError(\n f\"Unsupported file format for '{url}'. Only JSON and YAML are supported\",\n page=\"sections/how_to_guides/basic/pipeline/?h=seriali#serializing-the-pipeline\",\n )\n response = _download_remote_file(url)\n\n if url.endswith((\".yaml\", \".yml\")):\n content = response.content.decode(\"utf-8\")\n return yaml.safe_load(content)\n\n return response.json()\n "},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_pipeline_from_url","title":"get_pipeline_from_url(url, pipeline_name='pipeline') ","text":"Downloads the file to the current working directory and loads the pipeline object from a python script. Parameters: Name Type Description Default url str The URL pointing to the python script with the pipeline definition. required pipeline_name str The name of the pipeline in the script. I.e: with Pipeline(...) as pipeline:... . 'pipeline' Returns: Type Description BasePipeline The pipeline instantiated. Raises: Type Description ValueError If the file format is not supported. Source code in src/distilabel/cli/pipeline/utils.py def get_pipeline_from_url(url: str, pipeline_name: str = \"pipeline\") -> \"BasePipeline\":\n \"\"\"Downloads the file to the current working directory and loads the pipeline object\n from a python script.\n\n Args:\n url: The URL pointing to the python script with the pipeline definition.\n pipeline_name: The name of the pipeline in the script.\n I.e: `with Pipeline(...) as pipeline:...`.\n\n Returns:\n The pipeline instantiated.\n\n Raises:\n ValueError: If the file format is not supported.\n \"\"\"\n if not url.endswith(\".py\"):\n raise DistilabelUserError(\n f\"Unsupported file format for '{url}'. It must be a python file.\",\n page=\"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run\",\n )\n response = _download_remote_file(url)\n\n content = response.content.decode(\"utf-8\")\n script_local = Path.cwd() / Path(url).name\n script_local.write_text(content)\n\n # Add the current working directory to sys.path\n sys.path.insert(0, os.getcwd())\n module = importlib.import_module(str(Path(url).stem))\n pipeline = getattr(module, pipeline_name, None)\n if not pipeline:\n raise ImportError(\n f\"The script must contain an object with the pipeline named: '{pipeline_name}' that can be imported\"\n )\n\n return pipeline\n "},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_pipeline","title":"get_pipeline(config_or_script, pipeline_name='pipeline') ","text":"Get a pipeline from a configuration file or a remote python script. Parameters: Name Type Description Default config_or_script str The path or URL to the pipeline configuration file or URL to a python script. required pipeline_name str The name of the pipeline in the script. I.e: with Pipeline(...) as pipeline:... . 'pipeline' Returns: Type Description BasePipeline The pipeline. Raises: Type Description ValueError If the file format is not supported. FileNotFoundError If the configuration file does not exist. 
Source code in src/distilabel/cli/pipeline/utils.py def get_pipeline(\n config_or_script: str, pipeline_name: str = \"pipeline\"\n) -> \"BasePipeline\":\n \"\"\"Get a pipeline from a configuration file or a remote python script.\n\n Args:\n config_or_script: The path or URL to the pipeline configuration file\n or URL to a python script.\n pipeline_name: The name of the pipeline in the script.\n I.e: `with Pipeline(...) as pipeline:...`.\n\n Returns:\n The pipeline.\n\n Raises:\n ValueError: If the file format is not supported.\n FileNotFoundError: If the configuration file does not exist.\n \"\"\"\n config = script = None\n if config_or_script.endswith((\".json\", \".yaml\", \".yml\")):\n config = config_or_script\n elif config_or_script.endswith(\".py\"):\n script = config_or_script\n else:\n raise DistilabelUserError(\n \"The file must be a valid config file or python script with a pipeline.\",\n page=\"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run\",\n )\n\n if valid_http_url(config_or_script):\n if config:\n data = get_config_from_url(config)\n return Pipeline.from_dict(data)\n return get_pipeline_from_url(script, pipeline_name=pipeline_name)\n\n if not config:\n raise ValueError(\n f\"To run a pipeline from a python script, run it as `python {script}`\"\n )\n\n if Path(config).is_file():\n return Pipeline.from_file(config)\n\n raise FileNotFoundError(f\"File '{config_or_script}' does not exist.\")\n "},{"location":"api/cli/#distilabel.cli.pipeline.utils.display_pipeline_information","title":"display_pipeline_information(pipeline) ","text":"Displays the pipeline information to the console. Parameters: Name Type Description Default pipeline BasePipeline The pipeline. required Source code in src/distilabel/cli/pipeline/utils.py def display_pipeline_information(pipeline: \"BasePipeline\") -> None:\n \"\"\"Displays the pipeline information to the console.\n\n Args:\n pipeline: The pipeline.\n \"\"\"\n from rich.console import Console\n\n Console().print(_build_pipeline_panel(pipeline))\n "},{"location":"api/distiset/","title":"Distiset","text":"This section contains the API reference for the Distiset. For more information on how to use the CLI, see Tutorial - CLI. "},{"location":"api/distiset/#distilabel.distiset.Distiset","title":"Distiset ","text":" Bases: dict Convenient wrapper around datasets.Dataset to push to the Hugging Face Hub. It's a dictionary where the keys correspond to the different leaf_steps from the internal DAG and the values are datasets.Dataset . Attributes: Name Type Description _pipeline_path Optional[Path] Optional path to the pipeline.yaml file that generated the dataset. Defaults to None . _artifacts_path Optional[Path] Optional path to the directory containing the generated artifacts by the pipeline steps. Defaults to None . _log_filename_path Optional[Path] Optional path to the pipeline.log file that generated was written by the pipeline. Defaults to None . _citations Optional[List[str]] Optional list containing citations that will be included in the dataset card. Defaults to None . 
Source code in src/distilabel/distiset.py class Distiset(dict):\n \"\"\"Convenient wrapper around `datasets.Dataset` to push to the Hugging Face Hub.\n\n It's a dictionary where the keys correspond to the different leaf_steps from the internal\n `DAG` and the values are `datasets.Dataset`.\n\n Attributes:\n _pipeline_path: Optional path to the `pipeline.yaml` file that generated the dataset.\n Defaults to `None`.\n _artifacts_path: Optional path to the directory containing the generated artifacts\n by the pipeline steps. Defaults to `None`.\n _log_filename_path: Optional path to the `pipeline.log` file that generated was written\n by the pipeline. Defaults to `None`.\n _citations: Optional list containing citations that will be included in the dataset\n card. Defaults to `None`.\n \"\"\"\n\n _pipeline_path: Optional[Path] = None\n _artifacts_path: Optional[Path] = None\n _log_filename_path: Optional[Path] = None\n _citations: Optional[List[str]] = None\n\n def push_to_hub(\n self,\n repo_id: str,\n private: bool = False,\n token: Optional[str] = None,\n generate_card: bool = True,\n include_script: bool = False,\n **kwargs: Any,\n ) -> None:\n \"\"\"Pushes the `Distiset` to the Hugging Face Hub, each dataset will be pushed as a different configuration\n corresponding to the leaf step that generated it.\n\n Args:\n repo_id:\n The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace\n of the logged-in user.\n private:\n Whether the dataset repository should be set to private or not. Only affects repository creation:\n a repository that already exists will not be affected by that parameter.\n token:\n An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n if no token is passed and the user is not logged-in.\n generate_card:\n Whether to generate a dataset card or not. Defaults to True.\n include_script:\n Whether you want to push the pipeline script to the hugging face hub to share it.\n If set to True, the name of the script that was run to create the distiset will be\n automatically determined, and that will be the name of the file uploaded to your\n repository. Take into account, this operation only makes sense for a distiset obtained\n from calling `Pipeline.run()` method. 
Defaults to False.\n **kwargs:\n Additional keyword arguments to pass to the `push_to_hub` method of the `datasets.Dataset` object.\n\n Raises:\n ValueError: If no token is provided and couldn't be retrieved automatically.\n \"\"\"\n script_filename = sys.argv[0]\n filename_py = (\n script_filename.split(\"/\")[-1]\n if \"/\" in script_filename\n else script_filename\n )\n script_path = Path.cwd() / script_filename\n\n if token is None:\n token = get_hf_token(self.__class__.__name__, \"token\")\n\n for name, dataset in self.items():\n dataset.push_to_hub(\n repo_id=repo_id,\n config_name=name,\n private=private,\n token=token,\n **kwargs,\n )\n\n if self.artifacts_path:\n upload_folder(\n repo_id=repo_id,\n folder_path=self.artifacts_path,\n path_in_repo=\"artifacts\",\n token=token,\n repo_type=\"dataset\",\n commit_message=\"Include pipeline artifacts\",\n )\n\n if include_script and script_path.exists():\n upload_file(\n path_or_fileobj=script_path,\n path_in_repo=filename_py,\n repo_id=repo_id,\n repo_type=\"dataset\",\n token=token,\n commit_message=\"Include pipeline script\",\n )\n\n if generate_card:\n self._generate_card(\n repo_id, token, include_script=include_script, filename_py=filename_py\n )\n\n def _get_card(\n self,\n repo_id: str,\n token: Optional[str] = None,\n include_script: bool = False,\n filename_py: Optional[str] = None,\n ) -> DistilabelDatasetCard:\n \"\"\"Generates the dataset card for the `Distiset`.\n\n Note:\n If `repo_id` and `token` are provided, it will extract the metadata from the README.md file\n on the hub.\n\n Args:\n repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.\n token: The token to authenticate with the Hugging Face Hub.\n We assume that if it's provided, the dataset will be in the Hugging Face Hub,\n so the README metadata will be extracted from there.\n include_script: Whether to upload the script to the hugging face repository.\n filename_py: The name of the script. If `include_script` is True, the script will\n be uploaded to the repository using this name, otherwise it won't be used.\n\n Returns:\n The dataset card for the `Distiset`.\n \"\"\"\n sample_records = {}\n for name, dataset in self.items():\n record = (\n dataset[0] if not isinstance(dataset, dict) else dataset[\"train\"][0]\n )\n for key, value in record.items():\n # If list is too big, the `README.md` generated will be huge so we truncate it\n if isinstance(value, list):\n length = len(value)\n if length < 10:\n continue\n record[key] = value[:10]\n record[key].append(\n f\"... 
(truncated - showing 10 of {length} elements)\"\n )\n sample_records[name] = record\n\n readme_metadata = {}\n if repo_id and token:\n readme_metadata = self._extract_readme_metadata(repo_id, token)\n\n metadata = {\n **readme_metadata,\n \"size_categories\": size_categories_parser(\n max(len(dataset) for dataset in self.values())\n ),\n \"tags\": [\"synthetic\", \"distilabel\", \"rlaif\"],\n }\n\n card = DistilabelDatasetCard.from_template(\n card_data=DatasetCardData(**metadata),\n repo_id=repo_id,\n sample_records=sample_records,\n include_script=include_script,\n filename_py=filename_py,\n artifacts=self._get_artifacts_metadata(),\n references=self.citations,\n )\n\n return card\n\n def _get_artifacts_metadata(self) -> Dict[str, List[Dict[str, Any]]]:\n \"\"\"Gets a dictionary with the metadata of the artifacts generated by the pipeline steps.\n\n Returns:\n A dictionary in which the key is the name of the step and the value is a list\n of dictionaries, each of them containing the name and metadata of the step artifact.\n \"\"\"\n if not self.artifacts_path:\n return {}\n\n def iterdir_ignore_hidden(path: Path) -> Generator[Path, None, None]:\n return (f for f in Path(path).iterdir() if not f.name.startswith(\".\"))\n\n artifacts_metadata = defaultdict(list)\n for step_artifacts_dir in iterdir_ignore_hidden(self.artifacts_path):\n step_name = step_artifacts_dir.stem\n for artifact_dir in iterdir_ignore_hidden(step_artifacts_dir):\n artifact_name = artifact_dir.stem\n metadata_path = artifact_dir / \"metadata.json\"\n metadata = json.loads(metadata_path.read_text())\n artifacts_metadata[step_name].append(\n {\"name\": artifact_name, \"metadata\": metadata}\n )\n\n return dict(artifacts_metadata)\n\n def _extract_readme_metadata(\n self, repo_id: str, token: Optional[str]\n ) -> Dict[str, Any]:\n \"\"\"Extracts the metadata from the README.md file of the dataset repository.\n\n We have to download the previous README.md file in the repo, extract the metadata from it,\n and generate a dict again to be passed thorough the `DatasetCardData` object.\n\n Args:\n repo_id: The ID of the repository to push to, from the `push_to_hub` method.\n token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.\n\n Returns:\n The metadata extracted from the README.md file of the dataset repository as a dict.\n \"\"\"\n readme_path = Path(\n hf_hub_download(repo_id, \"README.md\", repo_type=\"dataset\", token=token)\n )\n # Remove the '---' from the metadata\n metadata = re.findall(r\"---\\n(.*?)\\n---\", readme_path.read_text(), re.DOTALL)[0]\n metadata = yaml.safe_load(metadata)\n return metadata\n\n def _generate_card(\n self,\n repo_id: str,\n token: str,\n include_script: bool = False,\n filename_py: Optional[str] = None,\n ) -> None:\n \"\"\"Generates a dataset card and pushes it to the Hugging Face Hub, and\n if the `pipeline.yaml` path is available in the `Distiset`, uploads that\n to the same repository.\n\n Args:\n repo_id: The ID of the repository to push to, from the `push_to_hub` method.\n token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.\n include_script: Whether to upload the script to the hugging face repository.\n filename_py: The name of the script. 
If `include_script` is True, the script will\n be uploaded to the repository using this name, otherwise it won't be used.\n \"\"\"\n card = self._get_card(\n repo_id=repo_id,\n token=token,\n include_script=include_script,\n filename_py=filename_py,\n )\n\n card.push_to_hub(\n repo_id,\n repo_type=\"dataset\",\n token=token,\n )\n\n if self.pipeline_path:\n # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.\n HfApi().upload_file(\n path_or_fileobj=self.pipeline_path,\n path_in_repo=PIPELINE_CONFIG_FILENAME,\n repo_id=repo_id,\n repo_type=\"dataset\",\n token=token,\n )\n\n if self.log_filename_path:\n # The same we had with \"pipeline.yaml\" but with the log file.\n HfApi().upload_file(\n path_or_fileobj=self.log_filename_path,\n path_in_repo=PIPELINE_LOG_FILENAME,\n repo_id=repo_id,\n repo_type=\"dataset\",\n token=token,\n )\n\n def train_test_split(\n self,\n train_size: float,\n shuffle: bool = True,\n seed: Optional[int] = None,\n ) -> Self:\n \"\"\"Return a `Distiset` whose values will be a `datasets.DatasetDict` with two random train and test subsets.\n Splits are created from the dataset according to `train_size` and `shuffle`.\n\n Args:\n train_size:\n Float between `0.0` and `1.0` representing the proportion of the dataset to include in the test split.\n It will be applied to all the datasets in the `Distiset`.\n shuffle: Whether or not to shuffle the data before splitting\n seed:\n A seed to initialize the default BitGenerator, passed to the underlying method.\n\n Returns:\n The `Distiset` with the train-test split applied to all the datasets.\n \"\"\"\n assert 0 < train_size < 1, \"train_size must be a float between 0 and 1\"\n for name, dataset in self.items():\n self[name] = dataset.train_test_split(\n train_size=train_size,\n shuffle=shuffle,\n seed=seed,\n )\n return self\n\n def save_to_disk(\n self,\n distiset_path: PathLike,\n max_shard_size: Optional[Union[str, int]] = None,\n num_shards: Optional[int] = None,\n num_proc: Optional[int] = None,\n storage_options: Optional[dict] = None,\n save_card: bool = True,\n save_pipeline_config: bool = True,\n save_pipeline_log: bool = True,\n ) -> None:\n r\"\"\"\n Saves a `Distiset` to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n In case you want to save the `Distiset` in a remote filesystem, you can pass the `storage_options` parameter\n as you would do with `datasets`'s `Dataset.save_to_disk` method: [see example](https://huggingface.co/docs/datasets/filesystems#saving-serialized-datasets)\n\n Args:\n distiset_path: Path where you want to save the `Distiset`. It can be a local path\n (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)\n max_shard_size: The maximum size of the dataset shards to be uploaded to the hub.\n If expressed as a string, needs to be digits followed by a unit (like `\"50MB\"`).\n Defaults to `None`.\n num_shards: Number of shards to write. By default the number of shards depends on\n `max_shard_size` and `num_proc`. Defaults to `None`.\n num_proc: Number of processes when downloading and generating the dataset locally.\n Multiprocessing is disabled by default. Defaults to `None`.\n storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n save_card: Whether to save the dataset card. 
Defaults to `True`.\n save_pipeline_config: Whether to save the pipeline configuration file (aka the `pipeline.yaml` file).\n Defaults to `True`.\n save_pipeline_log: Whether to save the pipeline log file (aka the `pipeline.log` file).\n Defaults to `True`.\n\n Examples:\n ```python\n # Save your distiset in a local folder:\n distiset.save_to_disk(distiset_path=\"my-distiset\")\n # Save your distiset in a remote storage:\n storage_options = {\n \"key\": os.environ[\"S3_ACCESS_KEY\"],\n \"secret\": os.environ[\"S3_SECRET_KEY\"],\n \"client_kwargs\": {\n \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n \"region_name\": os.environ[\"S3_REGION\"],\n },\n }\n distiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n ```\n \"\"\"\n distiset_path = str(distiset_path)\n for name, dataset in self.items():\n dataset.save_to_disk(\n f\"{distiset_path}/{name}\",\n max_shard_size=max_shard_size,\n num_shards=num_shards,\n num_proc=num_proc,\n storage_options=storage_options,\n )\n\n distiset_config_folder = posixpath.join(distiset_path, DISTISET_CONFIG_FOLDER)\n\n fs: fsspec.AbstractFileSystem\n fs, _, _ = fsspec.get_fs_token_paths(\n distiset_config_folder, storage_options=storage_options\n )\n fs.makedirs(distiset_config_folder, exist_ok=True)\n\n if self.artifacts_path:\n distiset_artifacts_folder = posixpath.join(\n distiset_path, DISTISET_ARTIFACTS_FOLDER\n )\n fs.copy(str(self.artifacts_path), distiset_artifacts_folder, recursive=True)\n\n if save_card:\n # NOTE:\u00a0Currently the card is not the same if we write to disk or push to the HF hub,\n # as we aren't generating the README copying/updating the data from the dataset repo.\n card = self._get_card(repo_id=Path(distiset_path).stem, token=None)\n new_filename = posixpath.join(distiset_config_folder, \"README.md\")\n if storage_options:\n # Write the card the same way as DatasetCard.save does:\n with fs.open(new_filename, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n f.write(str(card))\n else:\n card.save(new_filename)\n\n # Write our internal files to the distiset folder by copying them to the distiset folder.\n if save_pipeline_config and self.pipeline_path:\n new_filename = posixpath.join(\n distiset_config_folder, PIPELINE_CONFIG_FILENAME\n )\n if self.pipeline_path.exists() and (not fs.isfile(new_filename)):\n data = yaml.safe_load(self.pipeline_path.read_text())\n with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n yaml.dump(data, f, default_flow_style=False)\n\n if save_pipeline_log and self.log_filename_path:\n new_filename = posixpath.join(distiset_config_folder, PIPELINE_LOG_FILENAME)\n if self.log_filename_path.exists() and (not fs.isfile(new_filename)):\n data = self.log_filename_path.read_text()\n with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n f.write(data)\n\n @classmethod\n def load_from_disk(\n cls,\n distiset_path: PathLike,\n keep_in_memory: Optional[bool] = None,\n storage_options: Optional[Dict[str, Any]] = None,\n download_dir: Optional[PathLike] = None,\n ) -> Self:\n \"\"\"Loads a dataset that was previously saved using `Distiset.save_to_disk` from a dataset\n directory, or from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n Args:\n distiset_path: Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\").\n keep_in_memory: Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk``\n for more information. 
Defaults to `None`.\n storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n download_dir: Optional directory to download the dataset to. Defaults to None,\n in which case it will create a temporary directory.\n\n Returns:\n A `Distiset` loaded from disk, it should be a `Distiset` object created using `Distiset.save_to_disk`.\n \"\"\"\n original_distiset_path = str(distiset_path)\n\n fs: fsspec.AbstractFileSystem\n fs, _, [distiset_path] = fsspec.get_fs_token_paths( # type: ignore\n original_distiset_path, storage_options=storage_options\n )\n dest_distiset_path = distiset_path\n\n assert fs.isdir(\n original_distiset_path\n ), \"`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem.\"\n\n has_config = False\n has_artifacts = False\n distiset = cls()\n\n if is_remote_filesystem(fs):\n src_dataset_path = distiset_path\n if download_dir:\n dest_distiset_path = download_dir\n else:\n dest_distiset_path = Dataset._build_local_temp_path(src_dataset_path) # type: ignore\n fs.download(src_dataset_path, dest_distiset_path.as_posix(), recursive=True) # type: ignore\n\n # Now we should have the distiset locally, so we can read those files\n for folder in Path(dest_distiset_path).iterdir():\n if folder.stem == DISTISET_CONFIG_FOLDER:\n has_config = True\n continue\n elif folder.stem == DISTISET_ARTIFACTS_FOLDER:\n has_artifacts = True\n continue\n distiset[folder.stem] = load_from_disk(\n str(folder),\n keep_in_memory=keep_in_memory,\n )\n\n # From the config folder we just need to point to the files. Once downloaded we set the path to point to point to the files. Once downloaded we set the path\n # to wherever they are.\n if has_config:\n distiset_config_folder = posixpath.join(\n dest_distiset_path, DISTISET_CONFIG_FOLDER\n )\n\n pipeline_path = posixpath.join(\n distiset_config_folder, PIPELINE_CONFIG_FILENAME\n )\n if Path(pipeline_path).exists():\n distiset.pipeline_path = Path(pipeline_path)\n\n log_filename_path = posixpath.join(\n distiset_config_folder, PIPELINE_LOG_FILENAME\n )\n if Path(log_filename_path).exists():\n distiset.log_filename_path = Path(log_filename_path)\n\n if has_artifacts:\n distiset.artifacts_path = Path(\n posixpath.join(dest_distiset_path, DISTISET_ARTIFACTS_FOLDER)\n )\n\n return distiset\n\n @property\n def pipeline_path(self) -> Union[Path, None]:\n \"\"\"Returns the path to the `pipeline.yaml` file that generated the `Pipeline`.\"\"\"\n return self._pipeline_path\n\n @pipeline_path.setter\n def pipeline_path(self, path: PathLike) -> None:\n self._pipeline_path = Path(path)\n\n @property\n def artifacts_path(self) -> Union[Path, None]:\n \"\"\"Returns the path to the directory containing the artifacts generated by the steps\n of the pipeline.\"\"\"\n return self._artifacts_path\n\n @artifacts_path.setter\n def artifacts_path(self, path: PathLike) -> None:\n self._artifacts_path = Path(path)\n\n @property\n def log_filename_path(self) -> Union[Path, None]:\n \"\"\"Returns the path to the `pipeline.log` file that generated the `Pipeline`.\"\"\"\n return self._log_filename_path\n\n @log_filename_path.setter\n def log_filename_path(self, path: PathLike) -> None:\n self._log_filename_path = Path(path)\n\n @property\n def citations(self) -> Union[List[str], None]:\n \"\"\"Bibtex references to be included in the README.\"\"\"\n return self._citations\n\n @citations.setter\n def citations(self, citations_: List[str]) -> None:\n self._citations = sorted(set(citations_))\n\n def 
__repr__(self):\n # Copy from `datasets.DatasetDict.__repr__`.\n repr = \"\\n\".join([f\"{k}: {v}\" for k, v in self.items()])\n repr = re.sub(r\"^\", \" \" * 4, repr, count=0, flags=re.M)\n return f\"Distiset({{\\n{repr}\\n}})\"\n "},{"location":"api/distiset/#distilabel.distiset.Distiset.pipeline_path","title":"pipeline_path: Union[Path, None] property writable ","text":"Returns the path to the pipeline.yaml file that generated the Pipeline . "},{"location":"api/distiset/#distilabel.distiset.Distiset.artifacts_path","title":"artifacts_path: Union[Path, None] property writable ","text":"Returns the path to the directory containing the artifacts generated by the steps of the pipeline. "},{"location":"api/distiset/#distilabel.distiset.Distiset.log_filename_path","title":"log_filename_path: Union[Path, None] property writable ","text":"Returns the path to the pipeline.log file that generated the Pipeline . "},{"location":"api/distiset/#distilabel.distiset.Distiset.citations","title":"citations: Union[List[str], None] property writable ","text":"Bibtex references to be included in the README. "},{"location":"api/distiset/#distilabel.distiset.Distiset.push_to_hub","title":"push_to_hub(repo_id, private=False, token=None, generate_card=True, include_script=False, **kwargs) ","text":"Pushes the Distiset to the Hugging Face Hub, each dataset will be pushed as a different configuration corresponding to the leaf step that generated it. Parameters: Name Type Description Default repo_id str The ID of the repository to push to in the following format: <user>/<dataset_name> or <org>/<dataset_name> . Also accepts <dataset_name> , which will default to the namespace of the logged-in user. required private bool Whether the dataset repository should be set to private or not. Only affects repository creation: a repository that already exists will not be affected by that parameter. False token Optional[str] An optional authentication token for the Hugging Face Hub. If no token is passed, will default to the token saved locally when logging in with huggingface-cli login . Will raise an error if no token is passed and the user is not logged-in. None generate_card bool Whether to generate a dataset card or not. Defaults to True. True include_script bool Whether you want to push the pipeline script to the hugging face hub to share it. If set to True, the name of the script that was run to create the distiset will be automatically determined, and that will be the name of the file uploaded to your repository. Take into account, this operation only makes sense for a distiset obtained from calling Pipeline.run() method. Defaults to False. False **kwargs Any Additional keyword arguments to pass to the push_to_hub method of the datasets.Dataset object. {} Raises: Type Description ValueError If no token is provided and couldn't be retrieved automatically. Source code in src/distilabel/distiset.py def push_to_hub(\n self,\n repo_id: str,\n private: bool = False,\n token: Optional[str] = None,\n generate_card: bool = True,\n include_script: bool = False,\n **kwargs: Any,\n) -> None:\n \"\"\"Pushes the `Distiset` to the Hugging Face Hub, each dataset will be pushed as a different configuration\n corresponding to the leaf step that generated it.\n\n Args:\n repo_id:\n The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n `<org>/<dataset_name>`. 
Also accepts `<dataset_name>`, which will default to the namespace\n of the logged-in user.\n private:\n Whether the dataset repository should be set to private or not. Only affects repository creation:\n a repository that already exists will not be affected by that parameter.\n token:\n An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n if no token is passed and the user is not logged-in.\n generate_card:\n Whether to generate a dataset card or not. Defaults to True.\n include_script:\n Whether you want to push the pipeline script to the hugging face hub to share it.\n If set to True, the name of the script that was run to create the distiset will be\n automatically determined, and that will be the name of the file uploaded to your\n repository. Take into account, this operation only makes sense for a distiset obtained\n from calling `Pipeline.run()` method. Defaults to False.\n **kwargs:\n Additional keyword arguments to pass to the `push_to_hub` method of the `datasets.Dataset` object.\n\n Raises:\n ValueError: If no token is provided and couldn't be retrieved automatically.\n \"\"\"\n script_filename = sys.argv[0]\n filename_py = (\n script_filename.split(\"/\")[-1]\n if \"/\" in script_filename\n else script_filename\n )\n script_path = Path.cwd() / script_filename\n\n if token is None:\n token = get_hf_token(self.__class__.__name__, \"token\")\n\n for name, dataset in self.items():\n dataset.push_to_hub(\n repo_id=repo_id,\n config_name=name,\n private=private,\n token=token,\n **kwargs,\n )\n\n if self.artifacts_path:\n upload_folder(\n repo_id=repo_id,\n folder_path=self.artifacts_path,\n path_in_repo=\"artifacts\",\n token=token,\n repo_type=\"dataset\",\n commit_message=\"Include pipeline artifacts\",\n )\n\n if include_script and script_path.exists():\n upload_file(\n path_or_fileobj=script_path,\n path_in_repo=filename_py,\n repo_id=repo_id,\n repo_type=\"dataset\",\n token=token,\n commit_message=\"Include pipeline script\",\n )\n\n if generate_card:\n self._generate_card(\n repo_id, token, include_script=include_script, filename_py=filename_py\n )\n "},{"location":"api/distiset/#distilabel.distiset.Distiset.train_test_split","title":"train_test_split(train_size, shuffle=True, seed=None) ","text":"Return a Distiset whose values will be a datasets.DatasetDict with two random train and test subsets. Splits are created from the dataset according to train_size and shuffle . Parameters: Name Type Description Default train_size float Float between 0.0 and 1.0 representing the proportion of the dataset to include in the test split. It will be applied to all the datasets in the Distiset . required shuffle bool Whether or not to shuffle the data before splitting True seed Optional[int] A seed to initialize the default BitGenerator, passed to the underlying method. None Returns: Type Description Self The Distiset with the train-test split applied to all the datasets. 
Source code in src/distilabel/distiset.py def train_test_split(\n self,\n train_size: float,\n shuffle: bool = True,\n seed: Optional[int] = None,\n) -> Self:\n \"\"\"Return a `Distiset` whose values will be a `datasets.DatasetDict` with two random train and test subsets.\n Splits are created from the dataset according to `train_size` and `shuffle`.\n\n Args:\n train_size:\n Float between `0.0` and `1.0` representing the proportion of the dataset to include in the test split.\n It will be applied to all the datasets in the `Distiset`.\n shuffle: Whether or not to shuffle the data before splitting\n seed:\n A seed to initialize the default BitGenerator, passed to the underlying method.\n\n Returns:\n The `Distiset` with the train-test split applied to all the datasets.\n \"\"\"\n assert 0 < train_size < 1, \"train_size must be a float between 0 and 1\"\n for name, dataset in self.items():\n self[name] = dataset.train_test_split(\n train_size=train_size,\n shuffle=shuffle,\n seed=seed,\n )\n return self\n "},{"location":"api/distiset/#distilabel.distiset.Distiset.save_to_disk","title":"save_to_disk(distiset_path, max_shard_size=None, num_shards=None, num_proc=None, storage_options=None, save_card=True, save_pipeline_config=True, save_pipeline_log=True) ","text":"Saves a Distiset to a dataset directory, or in a filesystem using any implementation of fsspec.spec.AbstractFileSystem . In case you want to save the Distiset in a remote filesystem, you can pass the storage_options parameter as you would do with datasets 's Dataset.save_to_disk method: see example Parameters: Name Type Description Default distiset_path PathLike Path where you want to save the Distiset . It can be a local path (e.g. dataset/train ) or remote URI (e.g. s3://my-bucket/dataset/train ) required max_shard_size Optional[Union[str, int]] The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit (like \"50MB\" ). Defaults to None . None num_shards Optional[int] Number of shards to write. By default the number of shards depends on max_shard_size and num_proc . Defaults to None . None num_proc Optional[int] Number of processes when downloading and generating the dataset locally. Multiprocessing is disabled by default. Defaults to None . None storage_options Optional[dict] Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . None save_card bool Whether to save the dataset card. Defaults to True . True save_pipeline_config bool Whether to save the pipeline configuration file (aka the pipeline.yaml file). Defaults to True . True save_pipeline_log bool Whether to save the pipeline log file (aka the pipeline.log file). Defaults to True . 
True Examples: # Save your distiset in a local folder:\ndistiset.save_to_disk(distiset_path=\"my-distiset\")\n# Save your distiset in a remote storage:\nstorage_options = {\n \"key\": os.environ[\"S3_ACCESS_KEY\"],\n \"secret\": os.environ[\"S3_SECRET_KEY\"],\n \"client_kwargs\": {\n \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n \"region_name\": os.environ[\"S3_REGION\"],\n },\n}\ndistiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n Source code in src/distilabel/distiset.py def save_to_disk(\n self,\n distiset_path: PathLike,\n max_shard_size: Optional[Union[str, int]] = None,\n num_shards: Optional[int] = None,\n num_proc: Optional[int] = None,\n storage_options: Optional[dict] = None,\n save_card: bool = True,\n save_pipeline_config: bool = True,\n save_pipeline_log: bool = True,\n) -> None:\n r\"\"\"\n Saves a `Distiset` to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n In case you want to save the `Distiset` in a remote filesystem, you can pass the `storage_options` parameter\n as you would do with `datasets`'s `Dataset.save_to_disk` method: [see example](https://huggingface.co/docs/datasets/filesystems#saving-serialized-datasets)\n\n Args:\n distiset_path: Path where you want to save the `Distiset`. It can be a local path\n (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)\n max_shard_size: The maximum size of the dataset shards to be uploaded to the hub.\n If expressed as a string, needs to be digits followed by a unit (like `\"50MB\"`).\n Defaults to `None`.\n num_shards: Number of shards to write. By default the number of shards depends on\n `max_shard_size` and `num_proc`. Defaults to `None`.\n num_proc: Number of processes when downloading and generating the dataset locally.\n Multiprocessing is disabled by default. Defaults to `None`.\n storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n save_card: Whether to save the dataset card. 
Defaults to `True`.\n save_pipeline_config: Whether to save the pipeline configuration file (aka the `pipeline.yaml` file).\n Defaults to `True`.\n save_pipeline_log: Whether to save the pipeline log file (aka the `pipeline.log` file).\n Defaults to `True`.\n\n Examples:\n ```python\n # Save your distiset in a local folder:\n distiset.save_to_disk(distiset_path=\"my-distiset\")\n # Save your distiset in a remote storage:\n storage_options = {\n \"key\": os.environ[\"S3_ACCESS_KEY\"],\n \"secret\": os.environ[\"S3_SECRET_KEY\"],\n \"client_kwargs\": {\n \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n \"region_name\": os.environ[\"S3_REGION\"],\n },\n }\n distiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n ```\n \"\"\"\n distiset_path = str(distiset_path)\n for name, dataset in self.items():\n dataset.save_to_disk(\n f\"{distiset_path}/{name}\",\n max_shard_size=max_shard_size,\n num_shards=num_shards,\n num_proc=num_proc,\n storage_options=storage_options,\n )\n\n distiset_config_folder = posixpath.join(distiset_path, DISTISET_CONFIG_FOLDER)\n\n fs: fsspec.AbstractFileSystem\n fs, _, _ = fsspec.get_fs_token_paths(\n distiset_config_folder, storage_options=storage_options\n )\n fs.makedirs(distiset_config_folder, exist_ok=True)\n\n if self.artifacts_path:\n distiset_artifacts_folder = posixpath.join(\n distiset_path, DISTISET_ARTIFACTS_FOLDER\n )\n fs.copy(str(self.artifacts_path), distiset_artifacts_folder, recursive=True)\n\n if save_card:\n # NOTE:\u00a0Currently the card is not the same if we write to disk or push to the HF hub,\n # as we aren't generating the README copying/updating the data from the dataset repo.\n card = self._get_card(repo_id=Path(distiset_path).stem, token=None)\n new_filename = posixpath.join(distiset_config_folder, \"README.md\")\n if storage_options:\n # Write the card the same way as DatasetCard.save does:\n with fs.open(new_filename, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n f.write(str(card))\n else:\n card.save(new_filename)\n\n # Write our internal files to the distiset folder by copying them to the distiset folder.\n if save_pipeline_config and self.pipeline_path:\n new_filename = posixpath.join(\n distiset_config_folder, PIPELINE_CONFIG_FILENAME\n )\n if self.pipeline_path.exists() and (not fs.isfile(new_filename)):\n data = yaml.safe_load(self.pipeline_path.read_text())\n with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n yaml.dump(data, f, default_flow_style=False)\n\n if save_pipeline_log and self.log_filename_path:\n new_filename = posixpath.join(distiset_config_folder, PIPELINE_LOG_FILENAME)\n if self.log_filename_path.exists() and (not fs.isfile(new_filename)):\n data = self.log_filename_path.read_text()\n with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n f.write(data)\n "},{"location":"api/distiset/#distilabel.distiset.Distiset.load_from_disk","title":"load_from_disk(distiset_path, keep_in_memory=None, storage_options=None, download_dir=None) classmethod ","text":"Loads a dataset that was previously saved using Distiset.save_to_disk from a dataset directory, or from a filesystem using any implementation of fsspec.spec.AbstractFileSystem . Parameters: Name Type Description Default distiset_path PathLike Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\"). required keep_in_memory Optional[bool] Whether to copy the dataset in-memory, see datasets.Dataset.load_from_disk`` for more information. Defaults to None`. 
None storage_options Optional[Dict[str, Any]] Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . None download_dir Optional[PathLike] Optional directory to download the dataset to. Defaults to None, in which case it will create a temporary directory. None Returns: Type Description Self A Distiset loaded from disk, it should be a Distiset object created using Distiset.save_to_disk . Source code in src/distilabel/distiset.py @classmethod\ndef load_from_disk(\n cls,\n distiset_path: PathLike,\n keep_in_memory: Optional[bool] = None,\n storage_options: Optional[Dict[str, Any]] = None,\n download_dir: Optional[PathLike] = None,\n) -> Self:\n \"\"\"Loads a dataset that was previously saved using `Distiset.save_to_disk` from a dataset\n directory, or from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n Args:\n distiset_path: Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\").\n keep_in_memory: Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk``\n for more information. Defaults to `None`.\n storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n download_dir: Optional directory to download the dataset to. Defaults to None,\n in which case it will create a temporary directory.\n\n Returns:\n A `Distiset` loaded from disk, it should be a `Distiset` object created using `Distiset.save_to_disk`.\n \"\"\"\n original_distiset_path = str(distiset_path)\n\n fs: fsspec.AbstractFileSystem\n fs, _, [distiset_path] = fsspec.get_fs_token_paths( # type: ignore\n original_distiset_path, storage_options=storage_options\n )\n dest_distiset_path = distiset_path\n\n assert fs.isdir(\n original_distiset_path\n ), \"`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem.\"\n\n has_config = False\n has_artifacts = False\n distiset = cls()\n\n if is_remote_filesystem(fs):\n src_dataset_path = distiset_path\n if download_dir:\n dest_distiset_path = download_dir\n else:\n dest_distiset_path = Dataset._build_local_temp_path(src_dataset_path) # type: ignore\n fs.download(src_dataset_path, dest_distiset_path.as_posix(), recursive=True) # type: ignore\n\n # Now we should have the distiset locally, so we can read those files\n for folder in Path(dest_distiset_path).iterdir():\n if folder.stem == DISTISET_CONFIG_FOLDER:\n has_config = True\n continue\n elif folder.stem == DISTISET_ARTIFACTS_FOLDER:\n has_artifacts = True\n continue\n distiset[folder.stem] = load_from_disk(\n str(folder),\n keep_in_memory=keep_in_memory,\n )\n\n # From the config folder we just need to point to the files. Once downloaded we set the path to point to point to the files. 
Once downloaded we set the path\n # to wherever they are.\n if has_config:\n distiset_config_folder = posixpath.join(\n dest_distiset_path, DISTISET_CONFIG_FOLDER\n )\n\n pipeline_path = posixpath.join(\n distiset_config_folder, PIPELINE_CONFIG_FILENAME\n )\n if Path(pipeline_path).exists():\n distiset.pipeline_path = Path(pipeline_path)\n\n log_filename_path = posixpath.join(\n distiset_config_folder, PIPELINE_LOG_FILENAME\n )\n if Path(log_filename_path).exists():\n distiset.log_filename_path = Path(log_filename_path)\n\n if has_artifacts:\n distiset.artifacts_path = Path(\n posixpath.join(dest_distiset_path, DISTISET_ARTIFACTS_FOLDER)\n )\n\n return distiset\n "},{"location":"api/distiset/#distilabel.distiset.create_distiset","title":"create_distiset(data_dir, pipeline_path=None, log_filename_path=None, enable_metadata=False, dag=None) ","text":"Creates a Distiset from the buffer folder. This function is intended to be used as a helper to create a Distiset from from the folder where the cached data was written by the _WriteBuffer . Parameters: Name Type Description Default data_dir Path Folder where the data buffers were written by the _WriteBuffer . It should correspond to CacheLocation.data . required pipeline_path Optional[Path] Optional path to the pipeline.yaml file that generated the dataset. Internally this will be passed to the Distiset object on creation to allow uploading the pipeline.yaml file to the repo upon Distiset.push_to_hub . None log_filename_path Optional[Path] Optional path to the pipeline.log file that was generated during the pipeline run. Internally this will be passed to the Distiset object on creation to allow uploading the pipeline.log file to the repo upon Distiset.push_to_hub . None enable_metadata bool Whether to include the distilabel metadata column in the dataset or not. Defaults to False . False dag Optional[DAG] DAG contained in a Pipeline . If informed, will be used to extract the references/ citations from it. None Returns: Type Description Distiset The dataset created from the buffer folder, where the different leaf steps will Distiset correspond to different configurations of the dataset. 
Examples: from pathlib import Path\ndistiset = create_distiset(Path.home() / \".cache/distilabel/pipelines/path-to-pipe-hashname\")\n Source code in src/distilabel/distiset.py def create_distiset( # noqa: C901\n data_dir: Path,\n pipeline_path: Optional[Path] = None,\n log_filename_path: Optional[Path] = None,\n enable_metadata: bool = False,\n dag: Optional[\"DAG\"] = None,\n) -> Distiset:\n \"\"\"Creates a `Distiset` from the buffer folder.\n\n This function is intended to be used as a helper to create a `Distiset` from from the folder\n where the cached data was written by the `_WriteBuffer`.\n\n Args:\n data_dir: Folder where the data buffers were written by the `_WriteBuffer`.\n It should correspond to `CacheLocation.data`.\n pipeline_path: Optional path to the pipeline.yaml file that generated the dataset.\n Internally this will be passed to the `Distiset` object on creation to allow\n uploading the `pipeline.yaml` file to the repo upon `Distiset.push_to_hub`.\n log_filename_path: Optional path to the pipeline.log file that was generated during the pipeline run.\n Internally this will be passed to the `Distiset` object on creation to allow\n uploading the `pipeline.log` file to the repo upon `Distiset.push_to_hub`.\n enable_metadata: Whether to include the distilabel metadata column in the dataset or not.\n Defaults to `False`.\n dag: DAG contained in a `Pipeline`. If informed, will be used to extract the references/\n citations from it.\n\n Returns:\n The dataset created from the buffer folder, where the different leaf steps will\n correspond to different configurations of the dataset.\n\n Examples:\n ```python\n from pathlib import Path\n distiset = create_distiset(Path.home() / \".cache/distilabel/pipelines/path-to-pipe-hashname\")\n ```\n \"\"\"\n from distilabel.constants import DISTILABEL_METADATA_KEY\n\n logger = logging.getLogger(\"distilabel.distiset\")\n\n steps_outputs_dir = data_dir / STEPS_OUTPUTS_PATH\n\n distiset = Distiset()\n for file in steps_outputs_dir.iterdir():\n if file.is_file():\n continue\n\n files = [str(file) for file in list_files_in_dir(file)]\n if files:\n try:\n ds = load_dataset(\n \"parquet\", name=file.stem, data_files={\"train\": files}\n )\n if not enable_metadata and DISTILABEL_METADATA_KEY in ds.column_names:\n ds = ds.remove_columns(DISTILABEL_METADATA_KEY)\n distiset[file.stem] = ds\n except ArrowInvalid:\n logger.warning(f\"\u274c Failed to load the subset from '{file}' directory.\")\n continue\n else:\n logger.warning(\n f\"No output files for step '{file.stem}', can't create a dataset.\"\n \" Did the step produce any data?\"\n )\n\n # If there's only one dataset i.e. 
one config, then set the config name to `default`\n if len(distiset.keys()) == 1:\n distiset[\"default\"] = distiset.pop(list(distiset.keys())[0])\n\n # If there's any artifact set the `artifacts_path` so they can be uploaded\n steps_artifacts_dir = data_dir / STEPS_ARTIFACTS_PATH\n if any(steps_artifacts_dir.rglob(\"*\")):\n distiset.artifacts_path = steps_artifacts_dir\n\n # Include `pipeline.yaml` if exists\n if pipeline_path:\n distiset.pipeline_path = pipeline_path\n else:\n # If the pipeline path is not provided, try to find it in the parent directory\n # and assume that's the wanted file.\n pipeline_path = steps_outputs_dir.parent / \"pipeline.yaml\"\n if pipeline_path.exists():\n distiset.pipeline_path = pipeline_path\n\n # Include `pipeline.log` if exists\n if log_filename_path:\n distiset.log_filename_path = log_filename_path\n else:\n log_filename_path = steps_outputs_dir.parent / \"pipeline.log\"\n if log_filename_path.exists():\n distiset.log_filename_path = log_filename_path\n\n if dag:\n distiset._citations = _grab_citations(dag)\n\n return distiset\n "},{"location":"api/errors/","title":"Errors","text":"This section contains the distilabel custom errors. Unlike exceptions, errors in distilabel are used to handle unexpected situations that can't be anticipated and that can't be handled in a controlled way. "},{"location":"api/errors/#distilabel.errors.DistilabelError","title":"DistilabelError ","text":"A mixin class for common functionality shared by all Distilabel-specific errors. Attributes: Name Type Description message A message describing the error. page An optional error code from PydanticErrorCodes enum. Examples: raise DistilabelUserError(\"This is an error message.\")\nThis is an error message.\n\nraise DistilabelUserError(\"This is an error message.\", page=\"sections/getting_started/faq/\")\nThis is an error message.\nFor further information visit 'https://distilabel.argilla.io/latest/sections/getting_started/faq/'\n Source code in src/distilabel/errors.py class DistilabelError:\n \"\"\"A mixin class for common functionality shared by all Distilabel-specific errors.\n\n Attributes:\n message: A message describing the error.\n page: An optional error code from PydanticErrorCodes enum.\n\n Examples:\n ```python\n raise DistilabelUserError(\"This is an error message.\")\n This is an error message.\n\n raise DistilabelUserError(\"This is an error message.\", page=\"sections/getting_started/faq/\")\n This is an error message.\n For further information visit 'https://distilabel.argilla.io/latest/sections/getting_started/faq/'\n ```\n \"\"\"\n\n def __init__(self, message: str, *, page: Optional[str] = None) -> None:\n self.message = message\n self.page = page\n\n def __str__(self) -> str:\n if self.page is None:\n return self.message\n else:\n return f\"{self.message}\\n\\nFor further information visit '{DISTILABEL_DOCS_URL}{self.page}'\"\n "},{"location":"api/errors/#distilabel.errors.DistilabelUserError","title":"DistilabelUserError ","text":" Bases: DistilabelError , ValueError ValueError that we can redirect to a given page in the documentation. Source code in src/distilabel/errors.py class DistilabelUserError(DistilabelError, ValueError):\n \"\"\"ValueError that we can redirect to a given page in the documentation.\"\"\"\n\n pass\n "},{"location":"api/errors/#distilabel.errors.DistilabelTypeError","title":"DistilabelTypeError ","text":" Bases: DistilabelError , TypeError TypeError that we can redirect to a given page in the documentation. 
Source code in src/distilabel/errors.py class DistilabelTypeError(DistilabelError, TypeError):\n \"\"\"TypeError that we can redirect to a given page in the documentation.\"\"\"\n\n pass\n "},{"location":"api/errors/#distilabel.errors.DistilabelNotImplementedError","title":"DistilabelNotImplementedError ","text":" Bases: DistilabelError , NotImplementedError NotImplementedError that we can redirect to a given page in the documentation. Source code in src/distilabel/errors.py class DistilabelNotImplementedError(DistilabelError, NotImplementedError):\n \"\"\"NotImplementedError that we can redirect to a given page in the documentation.\"\"\"\n\n pass\n "},{"location":"api/exceptions/","title":"Exceptions","text":"This section contains the distilabel custom exceptions. Unlike errors, exceptions in distilabel are used to handle specific situations that can be anticipated and that can be handled in a controlled way internally by the library. "},{"location":"api/exceptions/#distilabel.exceptions.DistilabelException","title":"DistilabelException ","text":" Bases: Exception Base exception (can be gracefully handled) for distilabel framework. Source code in src/distilabel/exceptions.py class DistilabelException(Exception):\n \"\"\"Base exception (can be gracefully handled) for `distilabel` framework.\"\"\"\n\n pass\n "},{"location":"api/exceptions/#distilabel.exceptions.DistilabelGenerationException","title":"DistilabelGenerationException ","text":" Bases: DistilabelException Base exception for LLM generation errors. Source code in src/distilabel/exceptions.py class DistilabelGenerationException(DistilabelException):\n \"\"\"Base exception for `LLM` generation errors.\"\"\"\n\n pass\n "},{"location":"api/exceptions/#distilabel.exceptions.DistilabelOfflineBatchGenerationNotFinishedException","title":"DistilabelOfflineBatchGenerationNotFinishedException ","text":" Bases: DistilabelGenerationException Exception raised when a batch generation is not finished. Source code in src/distilabel/exceptions.py class DistilabelOfflineBatchGenerationNotFinishedException(\n DistilabelGenerationException\n):\n \"\"\"Exception raised when a batch generation is not finished.\"\"\"\n\n jobs_ids: Tuple[str, ...]\n\n def __init__(self, jobs_ids: Tuple[str, ...]) -> None:\n self.jobs_ids = jobs_ids\n super().__init__(f\"Batch generation with jobs_ids={jobs_ids} is not finished\")\n "},{"location":"api/mixins/requirements/","title":"RequirementsMixin","text":""},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin","title":"RequirementsMixin ","text":"Mixin for classes that have requirements attribute. Used to add requirements to a Step and a Pipeline . 
Source code in src/distilabel/mixins/requirements.py class RequirementsMixin:\n \"\"\"Mixin for classes that have `requirements` attribute.\n\n Used to add requirements to a `Step` and a `Pipeline`.\n \"\"\"\n\n _requirements: Union[List[Requirement], None] = []\n\n def _gather_requirements(self) -> List[str]:\n \"\"\"This method will be overwritten in the `BasePipeline` class to gather the requirements\n from each step.\n \"\"\"\n return []\n\n @property\n def requirements(self) -> List[str]:\n \"\"\"Return a list of requirements that must be installed to run the `Pipeline`.\n\n The requirements in a Pipeline will include the requirements from all the steps (if any).\n\n Returns:\n List of requirements that must be installed to run the `Pipeline`, sorted alphabetically.\n \"\"\"\n self.requirements = self._gather_requirements()\n\n return [str(r) for r in self._requirements]\n\n @requirements.setter\n def requirements(self, _requirements: List[str]) -> None:\n requirements = []\n if not isinstance(_requirements, list):\n _requirements = [_requirements]\n\n for r in _requirements:\n try:\n requirements.append(Requirement(r))\n except InvalidRequirement:\n self._logger.warning(f\"Invalid requirement: `{r}`\")\n\n self._requirements = sorted(\n set(self._requirements).union(set(requirements)), key=lambda x: str(x)\n )\n\n def requirements_to_install(self) -> List[str]:\n \"\"\"Check if the requirements are installed in the current environment, and returns the ones that aren't.\n\n Returns:\n List of requirements required to run the pipeline that are not installed in the current environment.\n \"\"\"\n\n to_install = []\n for req in self.requirements:\n requirement = Requirement(req)\n if importlib.util.find_spec(requirement.name):\n if (str(requirement.specifier) != \"\") and (\n version(requirement.name) != str(requirement.specifier)\n ):\n to_install.append(req)\n else:\n to_install.append(req)\n return to_install\n "},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin.requirements","title":"requirements: List[str] property writable ","text":"Return a list of requirements that must be installed to run the Pipeline . The requirements in a Pipeline will include the requirements from all the steps (if any). Returns: Type Description List[str] List of requirements that must be installed to run the Pipeline , sorted alphabetically. "},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin.requirements_to_install","title":"requirements_to_install() ","text":"Check if the requirements are installed in the current environment, and returns the ones that aren't. Returns: Type Description List[str] List of requirements required to run the pipeline that are not installed in the current environment. 
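Illustrative sketch (assumes a `pipeline` whose steps declare requirements; the package name shown is hypothetical): missing = pipeline.requirements_to_install()\nprint(missing) # e.g. ['some-package>=1.0'] if it is declared by a step but not installed in the current environment\n 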
Source code in src/distilabel/mixins/requirements.py def requirements_to_install(self) -> List[str]:\n \"\"\"Check if the requirements are installed in the current environment, and returns the ones that aren't.\n\n Returns:\n List of requirements required to run the pipeline that are not installed in the current environment.\n \"\"\"\n\n to_install = []\n for req in self.requirements:\n requirement = Requirement(req)\n if importlib.util.find_spec(requirement.name):\n if (str(requirement.specifier) != \"\") and (\n version(requirement.name) != str(requirement.specifier)\n ):\n to_install.append(req)\n else:\n to_install.append(req)\n return to_install\n "},{"location":"api/mixins/runtime_parameters/","title":"RuntimeParametersMixin","text":""},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin","title":"RuntimeParametersMixin ","text":" Bases: BaseModel Mixin for classes that have RuntimeParameter s attributes. Attributes: Name Type Description _runtime_parameters Dict[str, Any] A dictionary containing the values of the runtime parameters of the class. This attribute is meant to be used internally and should not be accessed directly. Source code in src/distilabel/mixins/runtime_parameters.py class RuntimeParametersMixin(BaseModel):\n \"\"\"Mixin for classes that have `RuntimeParameter`s attributes.\n\n Attributes:\n _runtime_parameters: A dictionary containing the values of the runtime parameters\n of the class. This attribute is meant to be used internally and should not be\n accessed directly.\n \"\"\"\n\n _runtime_parameters: Dict[str, Any] = PrivateAttr(default_factory=dict)\n\n @property\n def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n \"\"\"Returns a dictionary containing the name of the runtime parameters of the class\n as keys and whether the parameter is required or not as values.\n\n Returns:\n A dictionary containing the name of the runtime parameters of the class as keys\n and whether the parameter is required or not as values.\n \"\"\"\n\n runtime_parameters = {}\n\n for name, field_info in self.model_fields.items(): # type: ignore\n # `field: RuntimeParameter[Any]` or `field: Optional[RuntimeParameter[Any]]`\n is_runtime_param, is_optional = _is_runtime_parameter(field_info)\n if is_runtime_param:\n runtime_parameters[name] = is_optional\n continue\n\n attr = getattr(self, name)\n\n # `field: RuntimeParametersMixin`\n if isinstance(attr, RuntimeParametersMixin):\n runtime_parameters[name] = attr.runtime_parameters_names\n\n # `field: List[RuntimeParametersMixin]`\n if (\n isinstance(attr, list)\n and attr\n and isinstance(attr[0], RuntimeParametersMixin)\n ):\n runtime_parameters[name] = {\n str(i): item.runtime_parameters_names for i, item in enumerate(attr)\n }\n\n return runtime_parameters\n\n def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n \"\"\"Gets the information of the runtime parameters of the class such as the name and\n the description. 
This function is meant to include the information of the runtime\n parameters in the serialized data of the class.\n\n Returns:\n A list containing the information for each runtime parameter of the class.\n \"\"\"\n runtime_parameters_info = []\n for name, field_info in self.model_fields.items(): # type: ignore\n if name not in self.runtime_parameters_names:\n continue\n\n attr = getattr(self, name)\n\n # Get runtime parameters info for `RuntimeParametersMixin` field\n if isinstance(attr, RuntimeParametersMixin):\n runtime_parameters_info.append(\n {\n \"name\": name,\n \"runtime_parameters_info\": attr.get_runtime_parameters_info(),\n }\n )\n continue\n\n # Get runtime parameters info for `List[RuntimeParametersMixin]` field\n if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n runtime_parameters_info.append(\n {\n \"name\": name,\n \"runtime_parameters_info\": {\n str(i): item.get_runtime_parameters_info()\n for i, item in enumerate(attr)\n },\n }\n )\n continue\n\n info = {\"name\": name, \"optional\": self.runtime_parameters_names[name]}\n if field_info.description is not None:\n info[\"description\"] = field_info.description\n runtime_parameters_info.append(info)\n return runtime_parameters_info\n\n def set_runtime_parameters(self, runtime_parameters: Dict[str, Any]) -> None:\n \"\"\"Sets the runtime parameters of the class using the provided values. If the attr\n to be set is a `RuntimeParametersMixin`, it will call `set_runtime_parameters` on\n the attr.\n\n Args:\n runtime_parameters: A dictionary containing the values of the runtime parameters\n to set.\n \"\"\"\n runtime_parameters_names = list(self.runtime_parameters_names.keys())\n for name, value in runtime_parameters.items():\n if name not in self.runtime_parameters_names:\n # Check done just to ensure the unit tests for the mixin run\n if getattr(self, \"pipeline\", None):\n closest = difflib.get_close_matches(\n name, runtime_parameters_names, cutoff=0.5\n )\n msg = (\n f\"\u26a0\ufe0f Runtime parameter '{name}' unknown in step '{self.name}'.\" # type: ignore\n )\n if closest:\n msg += f\" Did you mean any of: {closest}\"\n else:\n msg += f\" Available runtime parameters for the step: {runtime_parameters_names}.\"\n self.pipeline._logger.warning(msg) # type: ignore\n continue\n\n attr = getattr(self, name)\n\n # Set runtime parameters for `RuntimeParametersMixin` field\n if isinstance(attr, RuntimeParametersMixin):\n attr.set_runtime_parameters(value)\n self._runtime_parameters[name] = value\n continue\n\n # Set runtime parameters for `List[RuntimeParametersMixin]` field\n if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n for i, item in enumerate(attr):\n item_value = value.get(str(i), {})\n item.set_runtime_parameters(item_value)\n self._runtime_parameters[name] = value\n continue\n\n # Handle settings values for `_SecretField`\n field_info = self.model_fields[name]\n inner_type = extract_annotation_inner_type(field_info.annotation)\n if is_type_pydantic_secret_field(inner_type):\n value = inner_type(value)\n\n # Set the value of the runtime parameter\n setattr(self, name, value)\n self._runtime_parameters[name] = value\n "},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property ","text":"Returns a dictionary containing the name of the runtime parameters of the class as keys and whether the parameter is required or not as values. 
Returns: Type Description RuntimeParametersNames A dictionary containing the name of the runtime parameters of the class as keys RuntimeParametersNames and whether the parameter is required or not as values. "},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.get_runtime_parameters_info","title":"get_runtime_parameters_info() ","text":"Gets the information of the runtime parameters of the class such as the name and the description. This function is meant to include the information of the runtime parameters in the serialized data of the class. Returns: Type Description List[RuntimeParameterInfo] A list containing the information for each runtime parameter of the class. Source code in src/distilabel/mixins/runtime_parameters.py def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n \"\"\"Gets the information of the runtime parameters of the class such as the name and\n the description. This function is meant to include the information of the runtime\n parameters in the serialized data of the class.\n\n Returns:\n A list containing the information for each runtime parameter of the class.\n \"\"\"\n runtime_parameters_info = []\n for name, field_info in self.model_fields.items(): # type: ignore\n if name not in self.runtime_parameters_names:\n continue\n\n attr = getattr(self, name)\n\n # Get runtime parameters info for `RuntimeParametersMixin` field\n if isinstance(attr, RuntimeParametersMixin):\n runtime_parameters_info.append(\n {\n \"name\": name,\n \"runtime_parameters_info\": attr.get_runtime_parameters_info(),\n }\n )\n continue\n\n # Get runtime parameters info for `List[RuntimeParametersMixin]` field\n if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n runtime_parameters_info.append(\n {\n \"name\": name,\n \"runtime_parameters_info\": {\n str(i): item.get_runtime_parameters_info()\n for i, item in enumerate(attr)\n },\n }\n )\n continue\n\n info = {\"name\": name, \"optional\": self.runtime_parameters_names[name]}\n if field_info.description is not None:\n info[\"description\"] = field_info.description\n runtime_parameters_info.append(info)\n return runtime_parameters_info\n "},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.set_runtime_parameters","title":"set_runtime_parameters(runtime_parameters) ","text":"Sets the runtime parameters of the class using the provided values. If the attr to be set is a RuntimeParametersMixin , it will call set_runtime_parameters on the attr. Parameters: Name Type Description Default runtime_parameters Dict[str, Any] A dictionary containing the values of the runtime parameters to set. required Source code in src/distilabel/mixins/runtime_parameters.py def set_runtime_parameters(self, runtime_parameters: Dict[str, Any]) -> None:\n \"\"\"Sets the runtime parameters of the class using the provided values. 
If the attr\n to be set is a `RuntimeParametersMixin`, it will call `set_runtime_parameters` on\n the attr.\n\n Args:\n runtime_parameters: A dictionary containing the values of the runtime parameters\n to set.\n \"\"\"\n runtime_parameters_names = list(self.runtime_parameters_names.keys())\n for name, value in runtime_parameters.items():\n if name not in self.runtime_parameters_names:\n # Check done just to ensure the unit tests for the mixin run\n if getattr(self, \"pipeline\", None):\n closest = difflib.get_close_matches(\n name, runtime_parameters_names, cutoff=0.5\n )\n msg = (\n f\"\u26a0\ufe0f Runtime parameter '{name}' unknown in step '{self.name}'.\" # type: ignore\n )\n if closest:\n msg += f\" Did you mean any of: {closest}\"\n else:\n msg += f\" Available runtime parameters for the step: {runtime_parameters_names}.\"\n self.pipeline._logger.warning(msg) # type: ignore\n continue\n\n attr = getattr(self, name)\n\n # Set runtime parameters for `RuntimeParametersMixin` field\n if isinstance(attr, RuntimeParametersMixin):\n attr.set_runtime_parameters(value)\n self._runtime_parameters[name] = value\n continue\n\n # Set runtime parameters for `List[RuntimeParametersMixin]` field\n if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n for i, item in enumerate(attr):\n item_value = value.get(str(i), {})\n item.set_runtime_parameters(item_value)\n self._runtime_parameters[name] = value\n continue\n\n # Handle settings values for `_SecretField`\n field_info = self.model_fields[name]\n inner_type = extract_annotation_inner_type(field_info.annotation)\n if is_type_pydantic_secret_field(inner_type):\n value = inner_type(value)\n\n # Set the value of the runtime parameter\n setattr(self, name, value)\n self._runtime_parameters[name] = value\n "},{"location":"api/models/embedding/","title":"Embedding","text":"This section contains the API reference for the distilabel embeddings. For more information on how the Embeddings works and see some examples. "},{"location":"api/models/embedding/#distilabel.models.embeddings.base","title":"base ","text":""},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings","title":"Embeddings ","text":" Bases: RuntimeParametersMixin , BaseModel , _Serializable , ABC Base class for Embeddings models. To implement an Embeddings subclass, you need to subclass this class and implement: - load method to load the Embeddings model. Don't forget to call super().load() , so the _logger attribute is initialized. - model_name property to return the model name used for the Embeddings . - encode method to generate the sentence embeddings. Attributes: Name Type Description _logger Logger the logger to be used for the Embeddings model. It will be initialized when the load method is called. Source code in src/distilabel/models/embeddings/base.py class Embeddings(RuntimeParametersMixin, BaseModel, _Serializable, ABC):\n \"\"\"Base class for `Embeddings` models.\n\n To implement an `Embeddings` subclass, you need to subclass this class and implement:\n - `load` method to load the `Embeddings` model. Don't forget to call `super().load()`,\n so the `_logger` attribute is initialized.\n - `model_name` property to return the model name used for the `Embeddings`.\n - `encode` method to generate the sentence embeddings.\n\n Attributes:\n _logger: the logger to be used for the `Embeddings` model. 
It will be initialized\n when the `load` method is called.\n \"\"\"\n\n model_config = ConfigDict(\n arbitrary_types_allowed=True,\n protected_namespaces=(),\n validate_default=True,\n validate_assignment=True,\n extra=\"forbid\",\n )\n _logger: \"Logger\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Method to be called to initialize the `Embeddings`\"\"\"\n self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n\n def unload(self) -> None:\n \"\"\"Method to be called to unload the `Embeddings` and release any resources.\"\"\"\n pass\n\n @property\n @abstractmethod\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the `Embeddings`.\"\"\"\n pass\n\n @abstractmethod\n def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n pass\n "},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.model_name","title":"model_name: str abstractmethod property ","text":"Returns the model name used for the Embeddings . "},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.load","title":"load() ","text":"Method to be called to initialize the Embeddings Source code in src/distilabel/models/embeddings/base.py def load(self) -> None:\n \"\"\"Method to be called to initialize the `Embeddings`\"\"\"\n self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n "},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.unload","title":"unload() ","text":"Method to be called to unload the Embeddings and release any resources. Source code in src/distilabel/models/embeddings/base.py def unload(self) -> None:\n \"\"\"Method to be called to unload the `Embeddings` and release any resources.\"\"\"\n pass\n "},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.encode","title":"encode(inputs) abstractmethod ","text":"Generates embeddings for the provided inputs. Parameters: Name Type Description Default inputs List[str] a list of texts for which an embedding has to be generated. required Returns: Type Description List[List[Union[int, float]]] The generated embeddings. Source code in src/distilabel/models/embeddings/base.py @abstractmethod\ndef encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n pass\n "},{"location":"api/models/embedding/embedding_gallery/","title":"Embedding Gallery","text":"This section contains the existing Embeddings subclasses implemented in distilabel . "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings","title":"embeddings ","text":""},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings","title":"SentenceTransformerEmbeddings ","text":" Bases: Embeddings , CudaDevicePlacementMixin sentence-transformers library implementation for embedding generation. Attributes: Name Type Description model str the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. device Optional[RuntimeParameter[str]] the name of the device used to load the model e.g. \"cuda\", \"mps\", etc. Defaults to None . 
prompts Optional[Dict[str, str]] a dictionary containing prompts to be used with the model. Defaults to None . default_prompt_name Optional[str] the default prompt (in prompts ) that will be applied to the inputs. If not provided, then no prompt will be used. Defaults to None . trust_remote_code bool whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . revision Optional[str] if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . token Optional[str] the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None . truncate_dim Optional[int] the dimension to truncate the sentence embeddings. Defaults to None . model_kwargs Optional[Dict[str, Any]] extra kwargs that will be passed to the Hugging Face transformers model class. Defaults to None . tokenizer_kwargs Optional[Dict[str, Any]] extra kwargs that will be passed to the Hugging Face transformers tokenizer class. Defaults to None . config_kwargs Optional[Dict[str, Any]] extra kwargs that will be passed to the Hugging Face transformers configuration class. Defaults to None . precision Optional[Literal['float32', 'int8', 'uint8', 'binary', 'ubinary']] the dtype that will have the resulting embeddings. Defaults to \"float32\" . normalize_embeddings RuntimeParameter[bool] whether to normalize the embeddings so they have a length of 1. Defaults to None . Examples: Generating sentence embeddings: from distilabel.models import SentenceTransformerEmbeddings\n\nembeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n# [-0.05447685346007347, -0.01623094454407692, ...],\n# [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n Source code in src/distilabel/models/embeddings/sentence_transformers.py class SentenceTransformerEmbeddings(Embeddings, CudaDevicePlacementMixin):\n \"\"\"`sentence-transformers` library implementation for embedding generation.\n\n Attributes:\n model: the model Hugging Face Hub repo id or a path to a directory containing the\n model weights and configuration files.\n device: the name of the device used to load the model e.g. \"cuda\", \"mps\", etc.\n Defaults to `None`.\n prompts: a dictionary containing prompts to be used with the model. Defaults to\n `None`.\n default_prompt_name: the default prompt (in `prompts`) that will be applied to the\n inputs. If not provided, then no prompt will be used. Defaults to `None`.\n trust_remote_code: whether to allow fetching and executing remote code fetched\n from the repository in the Hub. Defaults to `False`.\n revision: if `model` refers to a Hugging Face Hub repository, then the revision\n (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n local configuration will be used. Defaults to `None`.\n truncate_dim: the dimension to truncate the sentence embeddings. Defaults to `None`.\n model_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n model class. 
Defaults to `None`.\n tokenizer_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n tokenizer class. Defaults to `None`.\n config_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n configuration class. Defaults to `None`.\n precision: the dtype that will have the resulting embeddings. Defaults to `\"float32\"`.\n normalize_embeddings: whether to normalize the embeddings so they have a length\n of 1. Defaults to `None`.\n\n Examples:\n Generating sentence embeddings:\n\n ```python\n from distilabel.models import SentenceTransformerEmbeddings\n\n embeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\n embeddings.load()\n\n results = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n # [\n # [-0.05447685346007347, -0.01623094454407692, ...],\n # [4.4889533455716446e-05, 0.044016145169734955, ...],\n # ]\n ```\n \"\"\"\n\n model: str\n device: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The device to be used to load the model. If `None`, then it\"\n \" will check if a GPU can be used.\",\n )\n prompts: Optional[Dict[str, str]] = None\n default_prompt_name: Optional[str] = None\n trust_remote_code: bool = False\n revision: Optional[str] = None\n token: Optional[str] = None\n truncate_dim: Optional[int] = None\n model_kwargs: Optional[Dict[str, Any]] = None\n tokenizer_kwargs: Optional[Dict[str, Any]] = None\n config_kwargs: Optional[Dict[str, Any]] = None\n precision: Optional[Literal[\"float32\", \"int8\", \"uint8\", \"binary\", \"ubinary\"]] = (\n \"float32\"\n )\n normalize_embeddings: RuntimeParameter[bool] = Field(\n default=True,\n description=\"Whether to normalize the embeddings so the generated vectors\"\n \" have a length of 1 or not.\",\n )\n\n _model: Union[\"SentenceTransformer\", None] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the Sentence Transformer model\"\"\"\n super().load()\n\n if self.device == \"cuda\":\n CudaDevicePlacementMixin.load(self)\n\n try:\n from sentence_transformers import SentenceTransformer\n except ImportError as e:\n raise ImportError(\n \"`sentence-transformers` package is not installed. 
Please install it using\"\n \" `pip install sentence-transformers`.\"\n ) from e\n\n self._model = SentenceTransformer(\n model_name_or_path=self.model,\n device=self.device,\n prompts=self.prompts,\n default_prompt_name=self.default_prompt_name,\n trust_remote_code=self.trust_remote_code,\n revision=self.revision,\n token=self.token,\n truncate_dim=self.truncate_dim,\n model_kwargs=self.model_kwargs,\n tokenizer_kwargs=self.tokenizer_kwargs,\n config_kwargs=self.config_kwargs,\n )\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the name of the model.\"\"\"\n return self.model\n\n def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n return self._model.encode( # type: ignore\n sentences=inputs,\n batch_size=len(inputs),\n convert_to_numpy=True,\n precision=self.precision, # type: ignore\n normalize_embeddings=self.normalize_embeddings, # type: ignore\n ).tolist() # type: ignore\n\n def unload(self) -> None:\n del self._model\n if self.device == \"cuda\":\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.model_name","title":"model_name: str property ","text":"Returns the name of the model. "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.load","title":"load() ","text":"Loads the Sentence Transformer model Source code in src/distilabel/models/embeddings/sentence_transformers.py def load(self) -> None:\n \"\"\"Loads the Sentence Transformer model\"\"\"\n super().load()\n\n if self.device == \"cuda\":\n CudaDevicePlacementMixin.load(self)\n\n try:\n from sentence_transformers import SentenceTransformer\n except ImportError as e:\n raise ImportError(\n \"`sentence-transformers` package is not installed. Please install it using\"\n \" `pip install sentence-transformers`.\"\n ) from e\n\n self._model = SentenceTransformer(\n model_name_or_path=self.model,\n device=self.device,\n prompts=self.prompts,\n default_prompt_name=self.default_prompt_name,\n trust_remote_code=self.trust_remote_code,\n revision=self.revision,\n token=self.token,\n truncate_dim=self.truncate_dim,\n model_kwargs=self.model_kwargs,\n tokenizer_kwargs=self.tokenizer_kwargs,\n config_kwargs=self.config_kwargs,\n )\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.encode","title":"encode(inputs) ","text":"Generates embeddings for the provided inputs. Parameters: Name Type Description Default inputs List[str] a list of texts for which an embedding has to be generated. required Returns: Type Description List[List[Union[int, float]]] The generated embeddings. 
Source code in src/distilabel/models/embeddings/sentence_transformers.py def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n return self._model.encode( # type: ignore\n sentences=inputs,\n batch_size=len(inputs),\n convert_to_numpy=True,\n precision=self.precision, # type: ignore\n normalize_embeddings=self.normalize_embeddings, # type: ignore\n ).tolist() # type: ignore\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings","title":"vLLMEmbeddings ","text":" Bases: Embeddings , CudaDevicePlacementMixin vllm library implementation for embedding generation. Attributes: Name Type Description model str the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. dtype str the data type to use for the model. Defaults to auto . trust_remote_code bool whether to trust the remote code when loading the model. Defaults to False . quantization Optional[str] the quantization mode to use for the model. Defaults to None . revision Optional[str] the revision of the model to load. Defaults to None . enforce_eager bool whether to enforce eager execution. Defaults to True . seed int the seed to use for the random number generator. Defaults to 0 . extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]] additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {} . _model LLM the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. References - Offline inference embeddings
Examples: Generating sentence embeddings: from distilabel.models import vLLMEmbeddings\n\nembeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n# [-0.05447685346007347, -0.01623094454407692, ...],\n# [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n Source code in src/distilabel/models/embeddings/vllm.py class vLLMEmbeddings(Embeddings, CudaDevicePlacementMixin):\n \"\"\"`vllm` library implementation for embedding generation.\n\n Attributes:\n model: the model Hugging Face Hub repo id or a path to a directory containing the\n model weights and configuration files.\n dtype: the data type to use for the model. Defaults to `auto`.\n trust_remote_code: whether to trust the remote code when loading the model. Defaults\n to `False`.\n quantization: the quantization mode to use for the model. Defaults to `None`.\n revision: the revision of the model to load. Defaults to `None`.\n enforce_eager: whether to enforce eager execution. Defaults to `True`.\n seed: the seed to use for the random number generator. Defaults to `0`.\n extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n `LLM` class of `vllm` library. Defaults to `{}`.\n _model: the `vLLM` model instance. This attribute is meant to be used internally\n and should not be accessed directly. It will be set in the `load` method.\n\n References:\n - [Offline inference embeddings](https://docs.vllm.ai/en/latest/getting_started/examples/offline_inference_embedding.html)\n\n Examples:\n Generating sentence embeddings:\n\n ```python\n from distilabel.models import vLLMEmbeddings\n\n embeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\n embeddings.load()\n\n results = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n # [\n # [-0.05447685346007347, -0.01623094454407692, ...],\n # [4.4889533455716446e-05, 0.044016145169734955, ...],\n # ]\n ```\n \"\"\"\n\n model: str\n dtype: str = \"auto\"\n trust_remote_code: bool = False\n quantization: Optional[str] = None\n revision: Optional[str] = None\n\n enforce_eager: bool = True\n\n seed: int = 0\n\n extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n default_factory=dict,\n description=\"Additional dictionary of keyword arguments that will be passed to the\"\n \" `vLLM` class of `vllm` library. See all the supported arguments at: \"\n \"https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\",\n )\n\n _model: \"_vLLM\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\"\"\"\n super().load()\n\n CudaDevicePlacementMixin.load(self)\n\n try:\n from vllm import LLM as _vLLM\n\n except ImportError as ie:\n raise ImportError(\n \"vLLM is not installed. 
Please install it using `pip install vllm`.\"\n ) from ie\n\n self._model = _vLLM(\n self.model,\n dtype=self.dtype,\n trust_remote_code=self.trust_remote_code,\n quantization=self.quantization,\n revision=self.revision,\n enforce_eager=self.enforce_eager,\n seed=self.seed,\n **self.extra_kwargs, # type: ignore\n )\n\n def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the name of the model.\"\"\"\n return self.model\n\n def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n return [output.outputs.embedding for output in self._model.encode(inputs)]\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.model_name","title":"model_name: str property ","text":"Returns the name of the model. "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.load","title":"load() ","text":"Loads the vLLM model using either the path or the Hugging Face Hub repository id. Source code in src/distilabel/models/embeddings/vllm.py def load(self) -> None:\n \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\"\"\"\n super().load()\n\n CudaDevicePlacementMixin.load(self)\n\n try:\n from vllm import LLM as _vLLM\n\n except ImportError as ie:\n raise ImportError(\n \"vLLM is not installed. Please install it using `pip install vllm`.\"\n ) from ie\n\n self._model = _vLLM(\n self.model,\n dtype=self.dtype,\n trust_remote_code=self.trust_remote_code,\n quantization=self.quantization,\n revision=self.revision,\n enforce_eager=self.enforce_eager,\n seed=self.seed,\n **self.extra_kwargs, # type: ignore\n )\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.unload","title":"unload() ","text":"Unloads the vLLM model. Source code in src/distilabel/models/embeddings/vllm.py def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.encode","title":"encode(inputs) ","text":"Generates embeddings for the provided inputs. Parameters: Name Type Description Default inputs List[str] a list of texts for which an embedding has to be generated. required Returns: Type Description List[List[Union[int, float]]] The generated embeddings. Source code in src/distilabel/models/embeddings/vllm.py def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n return [output.outputs.embedding for output in self._model.encode(inputs)]\n "},{"location":"api/models/llm/","title":"LLM","text":"This section contains the API reference for the distilabel LLMs, both for the LLM synchronous implementation, and for the AsyncLLM asynchronous one. For more information and examples on how to use existing LLMs or create custom ones, please refer to Tutorial - LLM. 
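A minimal usage sketch (assuming the OpenAI API key is available in the environment; the model name is illustrative and any other LLM from the gallery can be used instead): from distilabel.models import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4o-mini\")\nllm.load()\noutputs = llm.generate_outputs(\n inputs=[[{\"role\": \"user\", \"content\": \"What is synthetic data?\"}]],\n num_generations=1,\n)\n 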
"},{"location":"api/models/llm/#distilabel.models.llms.base","title":"base ","text":""},{"location":"api/models/llm/#distilabel.models.llms.base.LLM","title":"LLM ","text":" Bases: RuntimeParametersMixin , BaseModel , _Serializable , ABC Base class for LLM s to be used in distilabel framework. To implement an LLM subclass, you need to subclass this class and implement: - load method to load the LLM if needed. Don't forget to call super().load() , so the _logger attribute is initialized. - model_name property to return the model name used for the LLM. - generate method to generate num_generations per input in inputs . Attributes: Name Type Description generation_kwargs Optional[RuntimeParameter[Dict[str, Any]]] the kwargs to be propagated to either generate or agenerate methods within each LLM . use_offline_batch_generation Optional[RuntimeParameter[bool]] whether to use the offline_batch_generate method to generate the responses. offline_batch_generation_block_until_done Optional[RuntimeParameter[int]] if provided, then polling will be done until the ofline_batch_generate method is able to retrieve the results. The value indicate the time to wait between each polling. jobs_ids Union[Tuple[str, ...], None] the job ids generated by the offline_batch_generate method. This attribute is used to store the job ids generated by the offline_batch_generate method so later they can be used to retrieve the results. It is not meant to be set by the user. _logger Logger the logger to be used for the LLM . It will be initialized when the load method is called. Source code in src/distilabel/models/llms/base.py class LLM(RuntimeParametersMixin, BaseModel, _Serializable, ABC):\n \"\"\"Base class for `LLM`s to be used in `distilabel` framework.\n\n To implement an `LLM` subclass, you need to subclass this class and implement:\n - `load` method to load the `LLM` if needed. Don't forget to call `super().load()`,\n so the `_logger` attribute is initialized.\n - `model_name` property to return the model name used for the LLM.\n - `generate` method to generate `num_generations` per input in `inputs`.\n\n Attributes:\n generation_kwargs: the kwargs to be propagated to either `generate` or `agenerate`\n methods within each `LLM`.\n use_offline_batch_generation: whether to use the `offline_batch_generate` method to\n generate the responses.\n offline_batch_generation_block_until_done: if provided, then polling will be done until\n the `ofline_batch_generate` method is able to retrieve the results. The value indicate\n the time to wait between each polling.\n jobs_ids: the job ids generated by the `offline_batch_generate` method. This attribute\n is used to store the job ids generated by the `offline_batch_generate` method\n so later they can be used to retrieve the results. It is not meant to be set by\n the user.\n _logger: the logger to be used for the `LLM`. 
It will be initialized when the `load`\n method is called.\n \"\"\"\n\n model_config = ConfigDict(\n arbitrary_types_allowed=True,\n protected_namespaces=(),\n validate_default=True,\n validate_assignment=True,\n extra=\"forbid\",\n )\n\n generation_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n default_factory=dict,\n description=\"The kwargs to be propagated to either `generate` or `agenerate`\"\n \" methods within each `LLM`.\",\n )\n use_offline_batch_generation: Optional[RuntimeParameter[bool]] = Field(\n default=False,\n description=\"Whether to use the `offline_batch_generate` method to generate\"\n \" the responses.\",\n )\n offline_batch_generation_block_until_done: Optional[RuntimeParameter[int]] = Field(\n default=None,\n description=\"If provided, then polling will be done until the `ofline_batch_generate`\"\n \" method is able to retrieve the results. The value indicate the time to wait between\"\n \" each polling.\",\n )\n\n jobs_ids: Union[Tuple[str, ...], None] = Field(default=None)\n _logger: \"Logger\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Method to be called to initialize the `LLM`, its logger and optionally the\n structured output generator.\"\"\"\n self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n\n def unload(self) -> None:\n \"\"\"Method to be called to unload the `LLM` and release any resources.\"\"\"\n pass\n\n @property\n @abstractmethod\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n pass\n\n def get_generation_kwargs(self) -> Dict[str, Any]:\n \"\"\"Returns the generation kwargs to be used for the generation. This method can\n be overridden to provide a more complex logic for the generation kwargs.\n\n Returns:\n The kwargs to be used for the generation.\n \"\"\"\n return self.generation_kwargs # type: ignore\n\n @abstractmethod\n def generate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Abstract method to be implemented by each LLM to generate `num_generations`\n per input in `inputs`.\n\n Args:\n inputs: the list of inputs to generate responses for which follows OpenAI's\n API format:\n\n ```python\n [\n {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n ]\n ```\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n \"\"\"\n pass\n\n def generate_outputs(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Generates outputs for the given inputs using either `generate` method or the\n `offine_batch_generate` method if `use_offline_\n \"\"\"\n if self.use_offline_batch_generation:\n if self.offline_batch_generation_block_until_done is not None:\n return self._offline_batch_generate_polling(\n inputs=inputs,\n num_generations=num_generations,\n **kwargs,\n )\n\n # This will raise `DistilabelOfflineBatchGenerationNotFinishedException` right away\n # if the batch generation is not finished.\n return self.offline_batch_generate(\n inputs=inputs,\n num_generations=num_generations,\n **kwargs,\n )\n\n return self.generate(inputs=inputs, 
num_generations=num_generations, **kwargs)\n\n def _offline_batch_generate_polling(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Method to poll the `offline_batch_generate` method until the batch generation\n is finished.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n while True:\n try:\n return self.offline_batch_generate(\n inputs=inputs,\n num_generations=num_generations,\n **kwargs,\n )\n except DistilabelOfflineBatchGenerationNotFinishedException as e:\n self._logger.info(\n f\"Waiting for the offline batch generation to finish: {e}. Sleeping\"\n f\" for {self.offline_batch_generation_block_until_done} seconds before\"\n \" trying to get the results again.\"\n )\n # When running a `Step` in a child process, SIGINT is overriden so the child\n # process doesn't stop when the parent process receives a SIGINT signal.\n # The new handler sets an environment variable that is checked here to stop\n # the polling.\n if os.getenv(SIGINT_HANDLER_CALLED_ENV_NAME) is not None:\n self._logger.info(\n \"Received a KeyboardInterrupt. Stopping polling for checking if the\"\n \" offline batch generation is finished...\"\n )\n raise e\n time.sleep(self.offline_batch_generation_block_until_done) # type: ignore\n except KeyboardInterrupt as e:\n # This is for the case the `LLM` is being executed outside a pipeline\n self._logger.info(\n \"Received a KeyboardInterrupt. Stopping polling for checking if the\"\n \" offline batch generation is finished...\"\n )\n raise DistilabelOfflineBatchGenerationNotFinishedException(\n jobs_ids=self.jobs_ids # type: ignore\n ) from e\n\n @property\n def generate_parameters(self) -> List[\"inspect.Parameter\"]:\n \"\"\"Returns the parameters of the `generate` method.\n\n Returns:\n A list containing the parameters of the `generate` method.\n \"\"\"\n return list(inspect.signature(self.generate).parameters.values())\n\n @property\n def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n \"\"\"Returns the runtime parameters of the `LLM`, which are combination of the\n attributes of the `LLM` type hinted with `RuntimeParameter` and the parameters\n of the `generate` method that are not `input` and `num_generations`.\n\n Returns:\n A dictionary with the name of the runtime parameters as keys and a boolean\n indicating if the parameter is optional or not.\n \"\"\"\n runtime_parameters = super().runtime_parameters_names\n runtime_parameters[\"generation_kwargs\"] = {}\n\n # runtime parameters from the `generate` method\n for param in self.generate_parameters:\n if param.name in [\"input\", \"inputs\", \"num_generations\"]:\n continue\n is_optional = param.default != inspect.Parameter.empty\n runtime_parameters[\"generation_kwargs\"][param.name] = is_optional\n\n return runtime_parameters\n\n def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n \"\"\"Gets the information of the runtime parameters of the `LLM` such as the name\n and the description. 
This function is meant to include the information of the runtime\n parameters in the serialized data of the `LLM`.\n\n Returns:\n A list containing the information for each runtime parameter of the `LLM`.\n \"\"\"\n runtime_parameters_info = super().get_runtime_parameters_info()\n\n generation_kwargs_info = next(\n (\n runtime_parameter_info\n for runtime_parameter_info in runtime_parameters_info\n if runtime_parameter_info[\"name\"] == \"generation_kwargs\"\n ),\n None,\n )\n\n # If `generation_kwargs` attribute is present, we need to include the `generate`\n # method arguments as the information for this attribute.\n if generation_kwargs_info:\n generate_docstring_args = self.generate_parsed_docstring[\"args\"]\n\n generation_kwargs_info[\"keys\"] = []\n for key, value in generation_kwargs_info[\"optional\"].items():\n info = {\"name\": key, \"optional\": value}\n if description := generate_docstring_args.get(key):\n info[\"description\"] = description\n generation_kwargs_info[\"keys\"].append(info)\n\n generation_kwargs_info.pop(\"optional\")\n\n return runtime_parameters_info\n\n @cached_property\n def generate_parsed_docstring(self) -> \"Docstring\":\n \"\"\"Returns the parsed docstring of the `generate` method.\n\n Returns:\n The parsed docstring of the `generate` method.\n \"\"\"\n return parse_google_docstring(self.generate)\n\n def get_last_hidden_states(\n self, inputs: List[\"StandardInput\"]\n ) -> List[\"HiddenState\"]:\n \"\"\"Method to get the last hidden states of the model for a list of inputs.\n\n Args:\n inputs: the list of inputs to get the last hidden states from.\n\n Returns:\n A list containing the last hidden state for each sequence using a NumPy array\n with shape [num_tokens, hidden_size].\n \"\"\"\n # TODO: update to use `DistilabelNotImplementedError`\n raise NotImplementedError(\n f\"Method `get_last_hidden_states` is not implemented for `{self.__class__.__name__}`\"\n )\n\n def _prepare_structured_output(\n self, structured_output: \"StructuredOutputType\"\n ) -> Union[Any, None]:\n \"\"\"Method in charge of preparing the structured output generator.\n\n By default will raise a `NotImplementedError`, subclasses that allow it must override this\n method with the implementation.\n\n Args:\n structured_output: the config to prepare the guided generation.\n\n Returns:\n The structure to be used for the guided generation.\n \"\"\"\n # TODO: update to use `DistilabelNotImplementedError`\n raise NotImplementedError(\n f\"Guided generation is not implemented for `{type(self).__name__}`\"\n )\n\n def offline_batch_generate(\n self,\n inputs: Union[List[\"FormattedInput\"], None] = None,\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Method to generate a list of outputs for the given inputs using an offline batch\n generation method to be implemented by each `LLM`.\n\n This method should create jobs the first time is called and store the job ids, so\n the second and subsequent calls can retrieve the results of the batch generation.\n If subsequent calls are made before the batch generation is finished, then the method\n should raise a `DistilabelOfflineBatchGenerationNotFinishedException`. 
This exception\n will be handled automatically by the `Pipeline` which will store all the required\n information for recovering the pipeline execution when the batch generation is finished.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n raise DistilabelNotImplementedError(\n f\"`offline_batch_generate` is not implemented for `{self.__class__.__name__}`\",\n page=\"sections/how_to_guides/advanced/offline-batch-generation/\",\n )\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.model_name","title":"model_name: str abstractmethod property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_parameters","title":"generate_parameters: List[inspect.Parameter] property ","text":"Returns the parameters of the generate method. Returns: Type Description List[Parameter] A list containing the parameters of the generate method. "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property ","text":"Returns the runtime parameters of the LLM , which are combination of the attributes of the LLM type hinted with RuntimeParameter and the parameters of the generate method that are not input and num_generations . Returns: Type Description RuntimeParametersNames A dictionary with the name of the runtime parameters as keys and a boolean RuntimeParametersNames indicating if the parameter is optional or not. "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_parsed_docstring","title":"generate_parsed_docstring: Docstring cached property ","text":"Returns the parsed docstring of the generate method. Returns: Type Description Docstring The parsed docstring of the generate method. "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.load","title":"load() ","text":"Method to be called to initialize the LLM , its logger and optionally the structured output generator. Source code in src/distilabel/models/llms/base.py def load(self) -> None:\n \"\"\"Method to be called to initialize the `LLM`, its logger and optionally the\n structured output generator.\"\"\"\n self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.unload","title":"unload() ","text":"Method to be called to unload the LLM and release any resources. Source code in src/distilabel/models/llms/base.py def unload(self) -> None:\n \"\"\"Method to be called to unload the `LLM` and release any resources.\"\"\"\n pass\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_generation_kwargs","title":"get_generation_kwargs() ","text":"Returns the generation kwargs to be used for the generation. This method can be overridden to provide a more complex logic for the generation kwargs. Returns: Type Description Dict[str, Any] The kwargs to be used for the generation. Source code in src/distilabel/models/llms/base.py def get_generation_kwargs(self) -> Dict[str, Any]:\n \"\"\"Returns the generation kwargs to be used for the generation. 
This method can\n be overridden to provide a more complex logic for the generation kwargs.\n\n Returns:\n The kwargs to be used for the generation.\n \"\"\"\n return self.generation_kwargs # type: ignore\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate","title":"generate(inputs, num_generations=1, **kwargs) abstractmethod ","text":"Abstract method to be implemented by each LLM to generate num_generations per input in inputs . Parameters: Name Type Description Default inputs List[FormattedInput] the list of inputs to generate responses for which follows OpenAI's API format: [\n {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n]\n required num_generations int the number of generations to generate per input. 1 **kwargs Any the additional kwargs to be used for the generation. {} Source code in src/distilabel/models/llms/base.py @abstractmethod\ndef generate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Abstract method to be implemented by each LLM to generate `num_generations`\n per input in `inputs`.\n\n Args:\n inputs: the list of inputs to generate responses for which follows OpenAI's\n API format:\n\n ```python\n [\n {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n ]\n ```\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n \"\"\"\n pass\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_outputs","title":"generate_outputs(inputs, num_generations=1, **kwargs) ","text":"Generates outputs for the given inputs using either the generate method or the offline_batch_generate method if use_offline_batch_generation is set to True . Source code in src/distilabel/models/llms/base.py def generate_outputs(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Generates outputs for the given inputs using either the `generate` method or the\n `offline_batch_generate` method if `use_offline_batch_generation` is set to `True`.\n \"\"\"\n if self.use_offline_batch_generation:\n if self.offline_batch_generation_block_until_done is not None:\n return self._offline_batch_generate_polling(\n inputs=inputs,\n num_generations=num_generations,\n **kwargs,\n )\n\n # This will raise `DistilabelOfflineBatchGenerationNotFinishedException` right away\n # if the batch generation is not finished.\n return self.offline_batch_generate(\n inputs=inputs,\n num_generations=num_generations,\n **kwargs,\n )\n\n return self.generate(inputs=inputs, num_generations=num_generations, **kwargs)\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_runtime_parameters_info","title":"get_runtime_parameters_info() ","text":"Gets the information of the runtime parameters of the LLM such as the name and the description. This function is meant to include the information of the runtime parameters in the serialized data of the LLM . 
Returns: Type Description List[RuntimeParameterInfo] A list containing the information for each runtime parameter of the LLM . Source code in src/distilabel/models/llms/base.py def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n \"\"\"Gets the information of the runtime parameters of the `LLM` such as the name\n and the description. This function is meant to include the information of the runtime\n parameters in the serialized data of the `LLM`.\n\n Returns:\n A list containing the information for each runtime parameter of the `LLM`.\n \"\"\"\n runtime_parameters_info = super().get_runtime_parameters_info()\n\n generation_kwargs_info = next(\n (\n runtime_parameter_info\n for runtime_parameter_info in runtime_parameters_info\n if runtime_parameter_info[\"name\"] == \"generation_kwargs\"\n ),\n None,\n )\n\n # If `generation_kwargs` attribute is present, we need to include the `generate`\n # method arguments as the information for this attribute.\n if generation_kwargs_info:\n generate_docstring_args = self.generate_parsed_docstring[\"args\"]\n\n generation_kwargs_info[\"keys\"] = []\n for key, value in generation_kwargs_info[\"optional\"].items():\n info = {\"name\": key, \"optional\": value}\n if description := generate_docstring_args.get(key):\n info[\"description\"] = description\n generation_kwargs_info[\"keys\"].append(info)\n\n generation_kwargs_info.pop(\"optional\")\n\n return runtime_parameters_info\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_last_hidden_states","title":"get_last_hidden_states(inputs) ","text":"Method to get the last hidden states of the model for a list of inputs. Parameters: Name Type Description Default inputs List[StandardInput] the list of inputs to get the last hidden states from. required Returns: Type Description List[HiddenState] A list containing the last hidden state for each sequence using a NumPy array with shape [num_tokens, hidden_size]. Source code in src/distilabel/models/llms/base.py def get_last_hidden_states(\n self, inputs: List[\"StandardInput\"]\n) -> List[\"HiddenState\"]:\n \"\"\"Method to get the last hidden states of the model for a list of inputs.\n\n Args:\n inputs: the list of inputs to get the last hidden states from.\n\n Returns:\n A list containing the last hidden state for each sequence using a NumPy array\n with shape [num_tokens, hidden_size].\n \"\"\"\n # TODO: update to use `DistilabelNotImplementedError`\n raise NotImplementedError(\n f\"Method `get_last_hidden_states` is not implemented for `{self.__class__.__name__}`\"\n )\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.offline_batch_generate","title":"offline_batch_generate(inputs=None, num_generations=1, **kwargs) ","text":"Method to generate a list of outputs for the given inputs using an offline batch generation method to be implemented by each LLM . This method should create jobs the first time is called and store the job ids, so the second and subsequent calls can retrieve the results of the batch generation. If subsequent calls are made before the batch generation is finished, then the method should raise a DistilabelOfflineBatchGenerationNotFinishedException . This exception will be handled automatically by the Pipeline which will store all the required information for recovering the pipeline execution when the batch generation is finished. Parameters: Name Type Description Default inputs Union[List[FormattedInput], None] the list of inputs to generate responses for. 
None num_generations int the number of generations to generate per input. 1 **kwargs Any the additional kwargs to be used for the generation. {} Returns: Type Description List[GenerateOutput] A list containing the generations for each input. Source code in src/distilabel/models/llms/base.py def offline_batch_generate(\n self,\n inputs: Union[List[\"FormattedInput\"], None] = None,\n num_generations: int = 1,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Method to generate a list of outputs for the given inputs using an offline batch\n generation method to be implemented by each `LLM`.\n\n This method should create jobs the first time is called and store the job ids, so\n the second and subsequent calls can retrieve the results of the batch generation.\n If subsequent calls are made before the batch generation is finished, then the method\n should raise a `DistilabelOfflineBatchGenerationNotFinishedException`. This exception\n will be handled automatically by the `Pipeline` which will store all the required\n information for recovering the pipeline execution when the batch generation is finished.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n raise DistilabelNotImplementedError(\n f\"`offline_batch_generate` is not implemented for `{self.__class__.__name__}`\",\n page=\"sections/how_to_guides/advanced/offline-batch-generation/\",\n )\n "},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM","title":"AsyncLLM ","text":" Bases: LLM Abstract class for asynchronous LLMs, so as to benefit from the async capabilities of each LLM implementation. This class is meant to be subclassed by each LLM, and the method agenerate needs to be implemented to provide the asynchronous generation of responses. Attributes: Name Type Description _event_loop AbstractEventLoop the event loop to be used for the asynchronous generation of responses. Source code in src/distilabel/models/llms/base.py class AsyncLLM(LLM):\n \"\"\"Abstract class for asynchronous LLMs, so as to benefit from the async capabilities\n of each LLM implementation. 
This class is meant to be subclassed by each LLM, and the\n method `agenerate` needs to be implemented to provide the asynchronous generation of\n responses.\n\n Attributes:\n _event_loop: the event loop to be used for the asynchronous generation of responses.\n \"\"\"\n\n _num_generations_param_supported = True\n _event_loop: \"asyncio.AbstractEventLoop\" = PrivateAttr(default=None)\n _new_event_loop: bool = PrivateAttr(default=False)\n\n @property\n def generate_parameters(self) -> List[inspect.Parameter]:\n \"\"\"Returns the parameters of the `agenerate` method.\n\n Returns:\n A list containing the parameters of the `agenerate` method.\n \"\"\"\n return list(inspect.signature(self.agenerate).parameters.values())\n\n @cached_property\n def generate_parsed_docstring(self) -> \"Docstring\":\n \"\"\"Returns the parsed docstring of the `agenerate` method.\n\n Returns:\n The parsed docstring of the `agenerate` method.\n \"\"\"\n return parse_google_docstring(self.agenerate)\n\n @property\n def event_loop(self) -> \"asyncio.AbstractEventLoop\":\n if self._event_loop is None:\n try:\n self._event_loop = asyncio.get_running_loop()\n if self._event_loop.is_closed():\n self._event_loop = asyncio.new_event_loop() # type: ignore\n self._new_event_loop = True\n except RuntimeError:\n self._event_loop = asyncio.new_event_loop()\n self._new_event_loop = True\n asyncio.set_event_loop(self._event_loop)\n return self._event_loop\n\n @abstractmethod\n async def agenerate(\n self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n ) -> \"GenerateOutput\":\n \"\"\"Method to generate a `num_generations` responses for a given input asynchronously,\n and executed concurrently in `generate` method.\n \"\"\"\n pass\n\n async def _agenerate(\n self, inputs: List[\"FormattedInput\"], num_generations: int = 1, **kwargs: Any\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n if self._num_generations_param_supported:\n tasks = [\n asyncio.create_task(\n self.agenerate(\n input=input, num_generations=num_generations, **kwargs\n )\n )\n for input in inputs\n ]\n result = await asyncio.gather(*tasks)\n return result\n\n tasks = [\n asyncio.create_task(self.agenerate(input=input, **kwargs))\n for input in inputs\n for _ in range(num_generations)\n ]\n outputs = await asyncio.gather(*tasks)\n return merge_responses(outputs, n=num_generations)\n\n def generate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Method to generate a list of responses asynchronously, returning the output\n synchronously awaiting for the response of each input sent to `agenerate`.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n return self.event_loop.run_until_complete(\n self._agenerate(inputs=inputs, num_generations=num_generations, **kwargs)\n )\n\n def __del__(self) -> None:\n \"\"\"Closes the event loop when the object is deleted.\"\"\"\n if sys.meta_path is None:\n return\n\n if 
self._new_event_loop:\n if self._event_loop.is_running():\n self._event_loop.stop()\n self._event_loop.close()\n\n @staticmethod\n def _prepare_structured_output( # type: ignore\n structured_output: \"InstructorStructuredOutputType\",\n client: Any = None,\n framework: Optional[str] = None,\n ) -> Dict[str, Union[str, Any]]:\n \"\"\"Wraps the client and updates the schema to work store it internally as a json schema.\n\n Args:\n structured_output: The configuration dict to prepare the structured output.\n client: The client to wrap to generate structured output. Implemented to work\n with `instructor`.\n framework: The name of the framework.\n\n Returns:\n A dictionary containing the wrapped client and the schema to update the structured_output\n variable in case it is a pydantic model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.instructor import (\n prepare_instructor,\n )\n\n result = {}\n client = prepare_instructor(\n client,\n mode=structured_output.get(\"mode\"),\n framework=framework, # type: ignore\n )\n result[\"client\"] = client\n\n schema = structured_output.get(\"schema\")\n if not schema:\n raise DistilabelUserError(\n f\"The `structured_output` argument must contain a schema: {structured_output}\",\n page=\"sections/how_to_guides/advanced/structured_generation/#instructor\",\n )\n if inspect.isclass(schema) and issubclass(schema, BaseModel):\n # We want a json schema for the serialization, but instructor wants a pydantic BaseModel.\n structured_output[\"schema\"] = schema.model_json_schema() # type: ignore\n result[\"structured_output\"] = structured_output\n\n return result\n\n @staticmethod\n def _prepare_kwargs(\n arguments: Dict[str, Any], structured_output: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"Helper method to update the kwargs with the structured output configuration,\n used in case they are defined.\n\n Args:\n arguments: The arguments that would be passed to the LLM as **kwargs.\n to update with the structured output configuration.\n structured_outputs: The structured output configuration to update the arguments.\n\n Returns:\n kwargs updated with the special arguments used by `instructor`.\n \"\"\"\n # We can deal with json schema or BaseModel, but we need to convert it to a BaseModel\n # for the Instructor client.\n schema = structured_output.get(\"schema\", {})\n\n # If there's already a pydantic model, we don't need to do anything,\n # otherwise, try to obtain one.\n if not (inspect.isclass(schema) and issubclass(schema, BaseModel)):\n from distilabel.steps.tasks.structured_outputs.utils import (\n json_schema_to_model,\n )\n\n if isinstance(schema, str):\n # In case it was saved in the dataset as a string.\n schema = json.loads(schema)\n\n try:\n schema = json_schema_to_model(schema)\n except Exception as e:\n raise ValueError(\n f\"Failed to convert the schema to a pydantic model, the model is too complex currently: {e}\"\n ) from e\n\n arguments.update(\n **{\n \"response_model\": schema,\n \"max_retries\": structured_output.get(\"max_retries\", 1),\n },\n )\n return arguments\n "},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate_parameters","title":"generate_parameters: List[inspect.Parameter] property ","text":"Returns the parameters of the agenerate method. Returns: Type Description List[Parameter] A list containing the parameters of the agenerate method. 
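The `AsyncLLM` contract documented above (implement `agenerate` and the `model_name` property; the base class then runs the coroutines on its event loop via `generate`) can be illustrated with a minimal, hedged sketch. The `EchoLLM` class below is hypothetical and not part of distilabel; it assumes the base class needs no extra constructor arguments and that the returned dictionary follows the `generations`/`statistics` layout implied by `merge_responses` further down.

```python
from typing import Any, Dict, List

from distilabel.models.llms.base import AsyncLLM


class EchoLLM(AsyncLLM):
    """Hypothetical toy LLM that echoes the last user message back."""

    @property
    def model_name(self) -> str:
        return "echo"

    async def agenerate(
        self, input: List[Dict[str, str]], num_generations: int = 1, **kwargs: Any
    ) -> Dict[str, Any]:
        # `input` follows the OpenAI-style chat format described in `LLM.generate`.
        last_user = next(m["content"] for m in reversed(input) if m["role"] == "user")
        return {
            "generations": [last_user] * num_generations,
            # Assumed statistics layout, mirroring what `merge_responses` expects.
            "statistics": {
                "input_tokens": [0] * num_generations,
                "output_tokens": [0] * num_generations,
            },
        }


llm = EchoLLM()
llm.load()
print(llm.generate_outputs(inputs=[[{"role": "user", "content": "Hello world!"}]]))
```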
"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate_parsed_docstring","title":"generate_parsed_docstring: Docstring cached property ","text":"Returns the parsed docstring of the agenerate method. Returns: Type Description Docstring The parsed docstring of the agenerate method. "},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.agenerate","title":"agenerate(input, num_generations=1, **kwargs) abstractmethod async ","text":"Method to generate a num_generations responses for a given input asynchronously, and executed concurrently in generate method. Source code in src/distilabel/models/llms/base.py @abstractmethod\nasync def agenerate(\n self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n) -> \"GenerateOutput\":\n \"\"\"Method to generate a `num_generations` responses for a given input asynchronously,\n and executed concurrently in `generate` method.\n \"\"\"\n pass\n "},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate","title":"generate(inputs, num_generations=1, **kwargs) ","text":"Method to generate a list of responses asynchronously, returning the output synchronously awaiting for the response of each input sent to agenerate . Parameters: Name Type Description Default inputs List[FormattedInput] the list of inputs to generate responses for. required num_generations int the number of generations to generate per input. 1 **kwargs Any the additional kwargs to be used for the generation. {} Returns: Type Description List[GenerateOutput] A list containing the generations for each input. Source code in src/distilabel/models/llms/base.py def generate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Method to generate a list of responses asynchronously, returning the output\n synchronously awaiting for the response of each input sent to `agenerate`.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n return self.event_loop.run_until_complete(\n self._agenerate(inputs=inputs, num_generations=num_generations, **kwargs)\n )\n "},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.__del__","title":"__del__() ","text":"Closes the event loop when the object is deleted. Source code in src/distilabel/models/llms/base.py def __del__(self) -> None:\n \"\"\"Closes the event loop when the object is deleted.\"\"\"\n if sys.meta_path is None:\n return\n\n if self._new_event_loop:\n if self._event_loop.is_running():\n self._event_loop.stop()\n self._event_loop.close()\n "},{"location":"api/models/llm/#distilabel.models.llms.base.merge_responses","title":"merge_responses(responses, n=1) ","text":"Helper function to group the responses from LLM.agenerate method according to the number of generations requested. Parameters: Name Type Description Default responses List[GenerateOutput] the responses from the LLM.agenerate method. required n int number of responses to group together. Defaults to 1. 1 Returns: Type Description List[GenerateOutput] List of merged responses, where each merged response contains n generations List[GenerateOutput] and their corresponding statistics. 
Source code in src/distilabel/models/llms/base.py def merge_responses(\n responses: List[\"GenerateOutput\"], n: int = 1\n) -> List[\"GenerateOutput\"]:\n \"\"\"Helper function to group the responses from `LLM.agenerate` method according\n to the number of generations requested.\n\n Args:\n responses: the responses from the `LLM.agenerate` method.\n n: number of responses to group together. Defaults to 1.\n\n Returns:\n List of merged responses, where each merged response contains n generations\n and their corresponding statistics.\n \"\"\"\n if not responses:\n return []\n\n def chunks(lst, n):\n \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n for i in range(0, len(lst), n):\n yield list(islice(lst, i, i + n))\n\n extra_keys = [\n key for key in responses[0].keys() if key not in (\"generations\", \"statistics\")\n ]\n\n result = []\n for group in chunks(responses, n):\n merged = {\n \"generations\": [],\n \"statistics\": {\"input_tokens\": [], \"output_tokens\": []},\n }\n for response in group:\n merged[\"generations\"].append(response[\"generations\"][0])\n # Merge statistics\n for key in response[\"statistics\"]:\n if key not in merged[\"statistics\"]:\n merged[\"statistics\"][key] = []\n merged[\"statistics\"][key].append(response[\"statistics\"][key][0])\n # Merge extra keys returned by the `LLM`\n for extra_key in extra_keys:\n if extra_key not in merged:\n merged[extra_key] = []\n merged[extra_key].append(response[extra_key][0])\n result.append(merged)\n return result\n "},{"location":"api/models/llm/llm_gallery/","title":"LLM Gallery","text":"This section contains the existing LLM subclasses implemented in distilabel . "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms","title":"llms ","text":""},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM","title":"AnthropicLLM ","text":" Bases: AsyncLLM Anthropic LLM implementation running the Async API client. Attributes: Name Type Description model str the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\", \"claude-3-sonnet-20240229\", etc. Available models can be checked here: Anthropic: Models overview. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable. base_url Optional[RuntimeParameter[str]] the base URL to use for the Anthropic API. Defaults to None which means that https://api.anthropic.com will be used internally. timeout RuntimeParameter[float] the maximum time in seconds to wait for a response. Defaults to 600.0 . max_retries RuntimeParameter[int] The maximum number of times to retry the request before failing. Defaults to 6 . http_client Optional[AsyncClient] if provided, an alternative HTTP client to use for calling Anthropic API. Defaults to None . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . _api_key_env_var str the name of the environment variable to use for the API key. It is meant to be used internally. _aclient Optional[AsyncAnthropic] the AsyncAnthropic client to use for the Anthropic API. It is meant to be used internally. Set in the load method. Runtime parameters api_key : the API key to authenticate the requests to the Anthropic API. 
If not provided, it will be read from ANTHROPIC_API_KEY environment variable. base_url : the base URL to use for the Anthropic API. Defaults to \"https://api.anthropic.com\" . timeout : the maximum time in seconds to wait for a response. Defaults to 600.0 . max_retries : the maximum number of times to retry the request before failing. Defaults to 6 . Examples: Generate text: from distilabel.models.llms import AnthropicLLM\n\nllm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pydantic import BaseModel\nfrom distilabel.models.llms import AnthropicLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = AnthropicLLM(\n model=\"claude-3-opus-20240229\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/anthropic.py class AnthropicLLM(AsyncLLM):\n \"\"\"Anthropic LLM implementation running the Async API client.\n\n Attributes:\n model: the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\",\n \"claude-3-sonnet-20240229\", etc. Available models can be checked here:\n [Anthropic: Models overview](https://docs.anthropic.com/claude/docs/models-overview).\n api_key: the API key to authenticate the requests to the Anthropic API. If not provided,\n it will be read from `ANTHROPIC_API_KEY` environment variable.\n base_url: the base URL to use for the Anthropic API. Defaults to `None` which means\n that `https://api.anthropic.com` will be used internally.\n timeout: the maximum time in seconds to wait for a response. Defaults to `600.0`.\n max_retries: The maximum number of times to retry the request before failing. Defaults\n to `6`.\n http_client: if provided, an alternative HTTP client to use for calling Anthropic\n API. Defaults to `None`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n _api_key_env_var: the name of the environment variable to use for the API key. It\n is meant to be used internally.\n _aclient: the `AsyncAnthropic` client to use for the Anthropic API. It is meant\n to be used internally. Set in the `load` method.\n\n Runtime parameters:\n - `api_key`: the API key to authenticate the requests to the Anthropic API. If not\n provided, it will be read from `ANTHROPIC_API_KEY` environment variable.\n - `base_url`: the base URL to use for the Anthropic API. Defaults to `\"https://api.anthropic.com\"`.\n - `timeout`: the maximum time in seconds to wait for a response. 
Defaults to `600.0`.\n - `max_retries`: the maximum number of times to retry the request before failing.\n Defaults to `6`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import AnthropicLLM\n\n llm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import AnthropicLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = AnthropicLLM(\n model=\"claude-3-opus-20240229\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n \"ANTHROPIC_BASE_URL\", \"https://api.anthropic.com\"\n ),\n description=\"The base URL to use for the Anthropic API.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_ANTHROPIC_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Anthropic API.\",\n )\n timeout: RuntimeParameter[float] = Field(\n default=600.0,\n description=\"The maximum time in seconds to wait for a response from the API.\",\n )\n max_retries: RuntimeParameter[int] = Field(\n default=6,\n description=\"The maximum number of times to retry the request to the API before\"\n \" failing.\",\n )\n http_client: Optional[AsyncClient] = Field(default=None, exclude=True)\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _num_generations_param_supported = False\n\n _api_key_env_var: str = PrivateAttr(default=_ANTHROPIC_API_KEY_ENV_VAR_NAME)\n _aclient: Optional[\"AsyncAnthropic\"] = PrivateAttr(...)\n\n def _check_model_exists(self) -> None:\n \"\"\"Checks if the specified model exists in the available models.\"\"\"\n from anthropic import AsyncAnthropic\n\n annotation = get_type_hints(AsyncAnthropic().messages.create).get(\"model\", None)\n models = [\n value\n for type_ in get_args(annotation)\n if get_origin(type_) is Literal\n for value in get_args(type_)\n ]\n\n if self.model not in models:\n raise ValueError(\n f\"Model {self.model} does not exist among available models. \"\n f\"The available models are {', '.join(models)}\"\n )\n\n def load(self) -> None:\n \"\"\"Loads the `AsyncAnthropic` client to use the Anthropic async API.\"\"\"\n super().load()\n\n try:\n from anthropic import AsyncAnthropic\n except ImportError as ie:\n raise ImportError(\n \"Anthropic Python client is not installed. 
Please install it using\"\n \" `pip install anthropic`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._check_model_exists()\n\n self._aclient = AsyncAnthropic(\n api_key=self.api_key.get_secret_value(),\n base_url=self.base_url,\n timeout=self.timeout,\n http_client=self.http_client,\n max_retries=self.max_retries,\n )\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"anthropic\",\n )\n self._aclient = result.get(\"client\")\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_tokens: int = 128,\n stop_sequences: Union[List[str], None] = None,\n temperature: float = 1.0,\n top_p: Union[float, None] = None,\n top_k: Union[int, None] = None,\n ) -> GenerateOutput:\n \"\"\"Generates a response asynchronously, using the [Anthropic Async API definition](https://github.com/anthropics/anthropic-sdk-python).\n\n Args:\n input: a single input in chat format to generate responses for.\n max_tokens: the maximum number of new tokens that the model will generate. Defaults to `128`.\n stop_sequences: custom text sequences that will cause the model to stop generating. Defaults to `NOT_GIVEN`.\n temperature: the temperature to use for the generation. Set only if top_p is None. Defaults to `1.0`.\n top_p: the top-p value to use for the generation. Defaults to `NOT_GIVEN`.\n top_k: the top-k value to use for the generation. 
Defaults to `NOT_GIVEN`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from anthropic._types import NOT_GIVEN\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"anthropic\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"system\": (\n input.pop(0)[\"content\"]\n if input and input[0][\"role\"] == \"system\"\n else NOT_GIVEN\n ),\n \"max_tokens\": max_tokens,\n \"stream\": False,\n \"stop_sequences\": NOT_GIVEN if stop_sequences is None else stop_sequences,\n \"temperature\": temperature,\n \"top_p\": NOT_GIVEN if top_p is None else top_p,\n \"top_k\": NOT_GIVEN if top_k is None else top_k,\n }\n\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n completion: Union[\"Message\", \"BaseModel\"] = await self._aclient.messages.create(\n **kwargs\n ) # type: ignore\n if structured_output:\n # raw_response = completion._raw_response\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n if (content := completion.content[0].text) is None:\n self._logger.warning(\n f\"Received no response using Anthropic client (model: '{self.model}').\"\n f\" Finish reason was: {completion.stop_reason}\"\n )\n return prepare_output([content], **self._get_llm_statistics(completion))\n\n @staticmethod\n def _get_llm_statistics(completion: \"Message\") -> \"LLMStatistics\":\n return {\n \"input_tokens\": [completion.usage.input_tokens],\n \"output_tokens\": [completion.usage.output_tokens],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM._check_model_exists","title":"_check_model_exists() ","text":"Checks if the specified model exists in the available models. Source code in src/distilabel/models/llms/anthropic.py def _check_model_exists(self) -> None:\n \"\"\"Checks if the specified model exists in the available models.\"\"\"\n from anthropic import AsyncAnthropic\n\n annotation = get_type_hints(AsyncAnthropic().messages.create).get(\"model\", None)\n models = [\n value\n for type_ in get_args(annotation)\n if get_origin(type_) is Literal\n for value in get_args(type_)\n ]\n\n if self.model not in models:\n raise ValueError(\n f\"Model {self.model} does not exist among available models. \"\n f\"The available models are {', '.join(models)}\"\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.load","title":"load() ","text":"Loads the AsyncAnthropic client to use the Anthropic async API. Source code in src/distilabel/models/llms/anthropic.py def load(self) -> None:\n \"\"\"Loads the `AsyncAnthropic` client to use the Anthropic async API.\"\"\"\n super().load()\n\n try:\n from anthropic import AsyncAnthropic\n except ImportError as ie:\n raise ImportError(\n \"Anthropic Python client is not installed. 
Please install it using\"\n \" `pip install anthropic`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._check_model_exists()\n\n self._aclient = AsyncAnthropic(\n api_key=self.api_key.get_secret_value(),\n base_url=self.base_url,\n timeout=self.timeout,\n http_client=self.http_client,\n max_retries=self.max_retries,\n )\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"anthropic\",\n )\n self._aclient = result.get(\"client\")\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.agenerate","title":"agenerate(input, max_tokens=128, stop_sequences=None, temperature=1.0, top_p=None, top_k=None) async ","text":"Generates a response asynchronously, using the Anthropic Async API definition. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required max_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 stop_sequences Union[List[str], None] custom text sequences that will cause the model to stop generating. Defaults to NOT_GIVEN . None temperature float the temperature to use for the generation. Set only if top_p is None. Defaults to 1.0 . 1.0 top_p Union[float, None] the top-p value to use for the generation. Defaults to NOT_GIVEN . None top_k Union[int, None] the top-k value to use for the generation. Defaults to NOT_GIVEN . None Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/anthropic.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_tokens: int = 128,\n stop_sequences: Union[List[str], None] = None,\n temperature: float = 1.0,\n top_p: Union[float, None] = None,\n top_k: Union[int, None] = None,\n) -> GenerateOutput:\n \"\"\"Generates a response asynchronously, using the [Anthropic Async API definition](https://github.com/anthropics/anthropic-sdk-python).\n\n Args:\n input: a single input in chat format to generate responses for.\n max_tokens: the maximum number of new tokens that the model will generate. Defaults to `128`.\n stop_sequences: custom text sequences that will cause the model to stop generating. Defaults to `NOT_GIVEN`.\n temperature: the temperature to use for the generation. Set only if top_p is None. Defaults to `1.0`.\n top_p: the top-p value to use for the generation. Defaults to `NOT_GIVEN`.\n top_k: the top-k value to use for the generation. 
Defaults to `NOT_GIVEN`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from anthropic._types import NOT_GIVEN\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"anthropic\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"system\": (\n input.pop(0)[\"content\"]\n if input and input[0][\"role\"] == \"system\"\n else NOT_GIVEN\n ),\n \"max_tokens\": max_tokens,\n \"stream\": False,\n \"stop_sequences\": NOT_GIVEN if stop_sequences is None else stop_sequences,\n \"temperature\": temperature,\n \"top_p\": NOT_GIVEN if top_p is None else top_p,\n \"top_k\": NOT_GIVEN if top_k is None else top_k,\n }\n\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n completion: Union[\"Message\", \"BaseModel\"] = await self._aclient.messages.create(\n **kwargs\n ) # type: ignore\n if structured_output:\n # raw_response = completion._raw_response\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n if (content := completion.content[0].text) is None:\n self._logger.warning(\n f\"Received no response using Anthropic client (model: '{self.model}').\"\n f\" Finish reason was: {completion.stop_reason}\"\n )\n return prepare_output([content], **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnyscaleLLM","title":"AnyscaleLLM ","text":" Bases: OpenAILLM Anyscale LLM implementation running the async API client of OpenAI. Attributes: Name Type Description model the model name to use for the LLM, e.g., google/gemma-7b-it . See the supported models under the \"Text Generation -> Supported Models\" section here. base_url Optional[RuntimeParameter[str]] the base URL to use for the Anyscale API requests. Defaults to None , which means that the value set for the environment variable ANYSCALE_BASE_URL will be used, or \"https://api.endpoints.anyscale.com/v1\" if not set. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Anyscale API. Defaults to None which means that the value set for the environment variable ANYSCALE_API_KEY will be used, or None if not set. _api_key_env_var str the name of the environment variable to use for the API key. It is meant to be used internally. Examples: Generate text: from distilabel.models.llms import AnyscaleLLM\n\nllm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Source code in src/distilabel/models/llms/anyscale.py class AnyscaleLLM(OpenAILLM):\n \"\"\"Anyscale LLM implementation running the async API client of OpenAI.\n\n Attributes:\n model: the model name to use for the LLM, e.g., `google/gemma-7b-it`. See the\n supported models under the \"Text Generation -> Supported Models\" section\n [here](https://docs.endpoints.anyscale.com/).\n base_url: the base URL to use for the Anyscale API requests. 
Defaults to `None`, which\n means that the value set for the environment variable `ANYSCALE_BASE_URL` will be used, or\n \"https://api.endpoints.anyscale.com/v1\" if not set.\n api_key: the API key to authenticate the requests to the Anyscale API. Defaults to `None` which\n means that the value set for the environment variable `ANYSCALE_API_KEY` will be used, or\n `None` if not set.\n _api_key_env_var: the name of the environment variable to use for the API key.\n It is meant to be used internally.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import AnyscaleLLM\n\n llm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n \"\"\"\n\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n \"ANYSCALE_BASE_URL\", \"https://api.endpoints.anyscale.com/v1\"\n ),\n description=\"The base URL to use for the Anyscale API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_ANYSCALE_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Anyscale API.\",\n )\n\n _api_key_env_var: str = PrivateAttr(_ANYSCALE_API_KEY_ENV_VAR_NAME)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AzureOpenAILLM","title":"AzureOpenAILLM ","text":" Bases: OpenAILLM Azure OpenAI LLM implementation running the async API client. Attributes: Name Type Description model the model name to use for the LLM i.e. the name of the Azure deployment. base_url Optional[RuntimeParameter[str]] the base URL to use for the Azure OpenAI API can be set with AZURE_OPENAI_ENDPOINT . Defaults to None which means that the value set for the environment variable AZURE_OPENAI_ENDPOINT will be used, or None if not set. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Azure OpenAI API. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_API_KEY will be used, or None if not set. api_version Optional[RuntimeParameter[str]] the API version to use for the Azure OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_VERSION will be used, or None if not set. 
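Beyond passing credentials explicitly (as in the examples that follow), `base_url`, `api_key` and `api_version` all fall back to environment variables, so the client can be configured purely through the environment. A minimal sketch, assuming a hypothetical deployment name `my-gpt4-deployment` and placeholder values:

```python
import os

from distilabel.models.llms import AzureOpenAILLM

# Placeholder values; in practice these point at a real Azure OpenAI resource.
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://my-resource.openai.azure.com/"
os.environ["AZURE_OPENAI_API_KEY"] = "api.key"
os.environ["OPENAI_API_VERSION"] = "2024-02-01"

# `model` is the name of the Azure deployment (hypothetical here).
llm = AzureOpenAILLM(model="my-gpt4-deployment")
llm.load()

output = llm.generate_outputs(inputs=[[{"role": "user", "content": "Hello world!"}]])
```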
Icon :material-microsoft-azure: Examples: Generate text: from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate text from a custom endpoint following the OpenAI API: from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pydantic import BaseModel\nfrom distilabel.models.llms import AzureOpenAILLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = AzureOpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/azure.py class AzureOpenAILLM(OpenAILLM):\n \"\"\"Azure OpenAI LLM implementation running the async API client.\n\n Attributes:\n model: the model name to use for the LLM i.e. the name of the Azure deployment.\n base_url: the base URL to use for the Azure OpenAI API can be set with `AZURE_OPENAI_ENDPOINT`.\n Defaults to `None` which means that the value set for the environment variable\n `AZURE_OPENAI_ENDPOINT` will be used, or `None` if not set.\n api_key: the API key to authenticate the requests to the Azure OpenAI API. Defaults to `None`\n which means that the value set for the environment variable `AZURE_OPENAI_API_KEY` will be\n used, or `None` if not set.\n api_version: the API version to use for the Azure OpenAI API. Defaults to `None` which means\n that the value set for the environment variable `OPENAI_API_VERSION` will be used, or\n `None` if not set.\n\n Icon:\n `:material-microsoft-azure:`\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import AzureOpenAILLM\n\n llm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate text from a custom endpoint following the OpenAI API:\n\n ```python\n from distilabel.models.llms import AzureOpenAILLM\n\n llm = AzureOpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import AzureOpenAILLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = AzureOpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(_AZURE_OPENAI_ENDPOINT_ENV_VAR_NAME),\n description=\"The base URL to use for the Azure OpenAI API requests i.e. 
the Azure OpenAI endpoint.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_AZURE_OPENAI_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Azure OpenAI API.\",\n )\n\n api_version: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\"OPENAI_API_VERSION\"),\n description=\"The API version to use for the Azure OpenAI API.\",\n )\n\n _base_url_env_var: str = PrivateAttr(_AZURE_OPENAI_ENDPOINT_ENV_VAR_NAME)\n _api_key_env_var: str = PrivateAttr(_AZURE_OPENAI_API_KEY_ENV_VAR_NAME)\n _aclient: Optional[\"AsyncAzureOpenAI\"] = PrivateAttr(...) # type: ignore\n\n @override\n def load(self) -> None:\n \"\"\"Loads the `AsyncAzureOpenAI` client to benefit from async requests.\"\"\"\n # This is a workaround to avoid the `OpenAILLM` calling the _prepare_structured_output\n # in the load method before we have the proper client.\n with patch(\n \"distilabel.models.openai.OpenAILLM._prepare_structured_output\", lambda x: x\n ):\n super().load()\n\n try:\n from openai import AsyncAzureOpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n # TODO: May be worth adding the AD auth too? Also the `organization`?\n self._aclient = AsyncAzureOpenAI( # type: ignore\n azure_endpoint=self.base_url, # type: ignore\n azure_deployment=self.model,\n api_version=self.api_version,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n self._prepare_structured_output(self.structured_output)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AzureOpenAILLM.load","title":"load() ","text":"Loads the AsyncAzureOpenAI client to benefit from async requests. Source code in src/distilabel/models/llms/azure.py @override\ndef load(self) -> None:\n \"\"\"Loads the `AsyncAzureOpenAI` client to benefit from async requests.\"\"\"\n # This is a workaround to avoid the `OpenAILLM` calling the _prepare_structured_output\n # in the load method before we have the proper client.\n with patch(\n \"distilabel.models.openai.OpenAILLM._prepare_structured_output\", lambda x: x\n ):\n super().load()\n\n try:\n from openai import AsyncAzureOpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n # TODO: May be worth adding the AD auth too? 
Also the `organization`?\n self._aclient = AsyncAzureOpenAI( # type: ignore\n azure_endpoint=self.base_url, # type: ignore\n azure_deployment=self.model,\n api_version=self.api_version,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n self._prepare_structured_output(self.structured_output)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM","title":"CohereLLM ","text":" Bases: AsyncLLM Cohere API implementation using the async client for concurrent text generation. Attributes: Name Type Description model str the name of the model from the Cohere API to use for the generation. base_url Optional[RuntimeParameter[str]] the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\" . api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable. timeout RuntimeParameter[int] the maximum time in seconds to wait for a response from the API. Defaults to 120 . client_name RuntimeParameter[str] the name of the client to use for the API requests. Defaults to \"distilabel\" . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . _ChatMessage Type[ChatMessage] the ChatMessage class from the cohere package. _aclient AsyncClient the AsyncClient client from the cohere package. Runtime parameters base_url : the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\" . api_key : the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable. timeout : the maximum time in seconds to wait for a response from the API. Defaults to 120 . client_name : the name of the client to use for the API requests. Defaults to \"distilabel\" . Examples: Generate text: from distilabel.models.llms import CohereLLM\n\nllm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n\n```python\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import CohereLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = CohereLLM(\n model=\"CohereForAI/c4ai-command-r-plus\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/cohere.py class CohereLLM(AsyncLLM):\n \"\"\"Cohere API implementation using the async client for concurrent text generation.\n\n Attributes:\n model: the name of the model from the Cohere API to use for the generation.\n base_url: the base URL to use for the Cohere API requests. Defaults to\n `\"https://api.cohere.ai/v1\"`.\n api_key: the API key to authenticate the requests to the Cohere API. Defaults to\n the value of the `COHERE_API_KEY` environment variable.\n timeout: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n client_name: the name of the client to use for the API requests. 
Defaults to\n `\"distilabel\"`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n _ChatMessage: the `ChatMessage` class from the `cohere` package.\n _aclient: the `AsyncClient` client from the `cohere` package.\n\n Runtime parameters:\n - `base_url`: the base URL to use for the Cohere API requests. Defaults to\n `\"https://api.cohere.ai/v1\"`.\n - `api_key`: the API key to authenticate the requests to the Cohere API. Defaults\n to the value of the `COHERE_API_KEY` environment variable.\n - `timeout`: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n - `client_name`: the name of the client to use for the API requests. Defaults to\n `\"distilabel\"`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import CohereLLM\n\n llm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import CohereLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = CohereLLM(\n model=\"CohereForAI/c4ai-command-r-plus\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n \"COHERE_BASE_URL\", \"https://api.cohere.ai/v1\"\n ),\n description=\"The base URL to use for the Cohere API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_COHERE_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Cohere API.\",\n )\n timeout: RuntimeParameter[int] = Field(\n default=120,\n description=\"The maximum time in seconds to wait for a response from the API.\",\n )\n client_name: RuntimeParameter[str] = Field(\n default=\"distilabel\",\n description=\"The name of the client to use for the API requests.\",\n )\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _num_generations_param_supported = False\n\n _ChatMessage: Type[\"ChatMessage\"] = PrivateAttr(...)\n _aclient: \"AsyncClient\" = PrivateAttr(...)\n _tokenizer: \"Tokenizer\" = PrivateAttr(...)\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n def load(self) -> None:\n \"\"\"Loads the `AsyncClient` client from the `cohere` package.\"\"\"\n\n super().load()\n\n try:\n from cohere import AsyncClient, ChatMessage\n except ImportError as ie:\n raise ImportError(\n \"The `cohere` package is required to use the `CohereLLM` class.\"\n ) from ie\n\n self._ChatMessage = ChatMessage\n\n self._aclient = AsyncClient(\n api_key=self.api_key.get_secret_value(), # type: ignore\n client_name=self.client_name,\n base_url=self.base_url,\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n 
structured_output=self.structured_output,\n client=self._aclient,\n framework=\"cohere\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n from cohere.manually_maintained.tokenizers import get_hf_tokenizer\n\n self._tokenizer: \"Tokenizer\" = get_hf_tokenizer(self._aclient, self.model)\n\n def _format_chat_to_cohere(\n self, input: \"FormattedInput\"\n ) -> Tuple[Union[str, None], List[\"ChatMessage\"], str]:\n \"\"\"Formats the chat input to the Cohere Chat API conversational format.\n\n Args:\n input: The chat input to format.\n\n Returns:\n A tuple containing the system, chat history, and message.\n \"\"\"\n system = None\n message = None\n chat_history = []\n for item in input:\n role = item[\"role\"]\n content = item[\"content\"]\n if role == \"system\":\n system = content\n elif role == \"user\":\n message = content\n elif role == \"assistant\":\n if message is None:\n raise ValueError(\n \"An assistant message but be preceded by a user message.\"\n )\n chat_history.append(self._ChatMessage(role=\"USER\", message=message)) # type: ignore\n chat_history.append(self._ChatMessage(role=\"CHATBOT\", message=content)) # type: ignore\n message = None\n\n if message is None:\n raise ValueError(\"The chat input must end with a user message.\")\n\n return system, chat_history, message\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n temperature: Optional[float] = None,\n max_tokens: Optional[int] = None,\n k: Optional[int] = None,\n p: Optional[float] = None,\n seed: Optional[float] = None,\n stop_sequences: Optional[Sequence[str]] = None,\n frequency_penalty: Optional[float] = None,\n presence_penalty: Optional[float] = None,\n raw_prompting: Optional[bool] = None,\n ) -> GenerateOutput:\n \"\"\"Generates a response from the LLM given an input.\n\n Args:\n input: a single input in chat format to generate responses for.\n temperature: the temperature to use for the generation. Defaults to `None`.\n max_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `None`.\n k: the number of highest probability vocabulary tokens to keep for the generation.\n Defaults to `None`.\n p: the nucleus sampling probability to use for the generation. Defaults to\n `None`.\n seed: the seed to use for the generation. Defaults to `None`.\n stop_sequences: a list of sequences to use as stopping criteria for the generation.\n Defaults to `None`.\n frequency_penalty: the frequency penalty to use for the generation. Defaults\n to `None`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `None`.\n raw_prompting: a flag to use raw prompting for the generation. 
Defaults to\n `None`.\n\n Returns:\n The generated response from the Cohere API model.\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output, # type: ignore\n client=self._aclient,\n framework=\"cohere\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n system, chat_history, message = self._format_chat_to_cohere(input)\n\n kwargs = {\n \"message\": message,\n \"model\": self.model,\n \"preamble\": system,\n \"chat_history\": chat_history,\n \"temperature\": temperature,\n \"max_tokens\": max_tokens,\n \"k\": k,\n \"p\": p,\n \"seed\": seed,\n \"stop_sequences\": stop_sequences,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"raw_prompting\": raw_prompting,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore\n\n response: Union[\"Message\", \"BaseModel\"] = await self._aclient.chat(**kwargs) # type: ignore\n\n if structured_output:\n return prepare_output(\n [response.model_dump_json()],\n **self._get_llm_statistics(\n input, orjson.dumps(response.model_dump_json()).decode(\"utf-8\")\n ), # type: ignore\n )\n\n if (text := response.text) == \"\":\n self._logger.warning( # type: ignore\n f\"Received no response using Cohere client (model: '{self.model}').\"\n f\" Finish reason was: {response.finish_reason}\"\n )\n return prepare_output(\n [None],\n **self._get_llm_statistics(input, \"\"),\n )\n\n return prepare_output(\n [text],\n **self._get_llm_statistics(input, text),\n )\n\n def _get_llm_statistics(\n self, input: FormattedInput, output: str\n ) -> \"LLMStatistics\":\n return {\n \"input_tokens\": [compute_tokens(input, self._tokenizer.encode)],\n \"output_tokens\": [compute_tokens(output, self._tokenizer.encode)],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.load","title":"load() ","text":"Loads the AsyncClient client from the cohere package. 
Source code in src/distilabel/models/llms/cohere.py def load(self) -> None:\n \"\"\"Loads the `AsyncClient` client from the `cohere` package.\"\"\"\n\n super().load()\n\n try:\n from cohere import AsyncClient, ChatMessage\n except ImportError as ie:\n raise ImportError(\n \"The `cohere` package is required to use the `CohereLLM` class.\"\n ) from ie\n\n self._ChatMessage = ChatMessage\n\n self._aclient = AsyncClient(\n api_key=self.api_key.get_secret_value(), # type: ignore\n client_name=self.client_name,\n base_url=self.base_url,\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"cohere\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n from cohere.manually_maintained.tokenizers import get_hf_tokenizer\n\n self._tokenizer: \"Tokenizer\" = get_hf_tokenizer(self._aclient, self.model)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM._format_chat_to_cohere","title":"_format_chat_to_cohere(input) ","text":"Formats the chat input to the Cohere Chat API conversational format. Parameters: Name Type Description Default input FormattedInput The chat input to format. required Returns: Type Description Tuple[Union[str, None], List[ChatMessage], str] A tuple containing the system, chat history, and message. Source code in src/distilabel/models/llms/cohere.py def _format_chat_to_cohere(\n self, input: \"FormattedInput\"\n) -> Tuple[Union[str, None], List[\"ChatMessage\"], str]:\n \"\"\"Formats the chat input to the Cohere Chat API conversational format.\n\n Args:\n input: The chat input to format.\n\n Returns:\n A tuple containing the system, chat history, and message.\n \"\"\"\n system = None\n message = None\n chat_history = []\n for item in input:\n role = item[\"role\"]\n content = item[\"content\"]\n if role == \"system\":\n system = content\n elif role == \"user\":\n message = content\n elif role == \"assistant\":\n if message is None:\n raise ValueError(\n \"An assistant message but be preceded by a user message.\"\n )\n chat_history.append(self._ChatMessage(role=\"USER\", message=message)) # type: ignore\n chat_history.append(self._ChatMessage(role=\"CHATBOT\", message=content)) # type: ignore\n message = None\n\n if message is None:\n raise ValueError(\"The chat input must end with a user message.\")\n\n return system, chat_history, message\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.agenerate","title":"agenerate(input, temperature=None, max_tokens=None, k=None, p=None, seed=None, stop_sequences=None, frequency_penalty=None, presence_penalty=None, raw_prompting=None) async ","text":"Generates a response from the LLM given an input. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required temperature Optional[float] the temperature to use for the generation. Defaults to None . None max_tokens Optional[int] the maximum number of new tokens that the model will generate. Defaults to None . None k Optional[int] the number of highest probability vocabulary tokens to keep for the generation. Defaults to None . None p Optional[float] the nucleus sampling probability to use for the generation. Defaults to None . None seed Optional[float] the seed to use for the generation. Defaults to None . 
None stop_sequences Optional[Sequence[str]] a list of sequences to use as stopping criteria for the generation. Defaults to None . None frequency_penalty Optional[float] the frequency penalty to use for the generation. Defaults to None . None presence_penalty Optional[float] the presence penalty to use for the generation. Defaults to None . None raw_prompting Optional[bool] a flag to use raw prompting for the generation. Defaults to None . None Returns: Type Description GenerateOutput The generated response from the Cohere API model. Source code in src/distilabel/models/llms/cohere.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n temperature: Optional[float] = None,\n max_tokens: Optional[int] = None,\n k: Optional[int] = None,\n p: Optional[float] = None,\n seed: Optional[float] = None,\n stop_sequences: Optional[Sequence[str]] = None,\n frequency_penalty: Optional[float] = None,\n presence_penalty: Optional[float] = None,\n raw_prompting: Optional[bool] = None,\n) -> GenerateOutput:\n \"\"\"Generates a response from the LLM given an input.\n\n Args:\n input: a single input in chat format to generate responses for.\n temperature: the temperature to use for the generation. Defaults to `None`.\n max_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `None`.\n k: the number of highest probability vocabulary tokens to keep for the generation.\n Defaults to `None`.\n p: the nucleus sampling probability to use for the generation. Defaults to\n `None`.\n seed: the seed to use for the generation. Defaults to `None`.\n stop_sequences: a list of sequences to use as stopping criteria for the generation.\n Defaults to `None`.\n frequency_penalty: the frequency penalty to use for the generation. Defaults\n to `None`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `None`.\n raw_prompting: a flag to use raw prompting for the generation. 
Defaults to\n `None`.\n\n Returns:\n The generated response from the Cohere API model.\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output, # type: ignore\n client=self._aclient,\n framework=\"cohere\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n system, chat_history, message = self._format_chat_to_cohere(input)\n\n kwargs = {\n \"message\": message,\n \"model\": self.model,\n \"preamble\": system,\n \"chat_history\": chat_history,\n \"temperature\": temperature,\n \"max_tokens\": max_tokens,\n \"k\": k,\n \"p\": p,\n \"seed\": seed,\n \"stop_sequences\": stop_sequences,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"raw_prompting\": raw_prompting,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore\n\n response: Union[\"Message\", \"BaseModel\"] = await self._aclient.chat(**kwargs) # type: ignore\n\n if structured_output:\n return prepare_output(\n [response.model_dump_json()],\n **self._get_llm_statistics(\n input, orjson.dumps(response.model_dump_json()).decode(\"utf-8\")\n ), # type: ignore\n )\n\n if (text := response.text) == \"\":\n self._logger.warning( # type: ignore\n f\"Received no response using Cohere client (model: '{self.model}').\"\n f\" Finish reason was: {response.finish_reason}\"\n )\n return prepare_output(\n [None],\n **self._get_llm_statistics(input, \"\"),\n )\n\n return prepare_output(\n [text],\n **self._get_llm_statistics(input, text),\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM","title":"GroqLLM ","text":" Bases: AsyncLLM Groq API implementation using the async client for concurrent text generation. Attributes: Name Type Description model str the name of the model from the Groq API to use for the generation. base_url Optional[RuntimeParameter[str]] the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\" . api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable. max_retries RuntimeParameter[int] the maximum number of times to retry the request to the API before failing. Defaults to 2 . timeout RuntimeParameter[int] the maximum time in seconds to wait for a response from the API. Defaults to 120 . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . _api_key_env_var str the name of the environment variable to use for the API key. _aclient Optional[AsyncGroq] the AsyncGroq client from the groq package. Runtime parameters base_url : the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\" . api_key : the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable. max_retries : the maximum number of times to retry the request to the API before failing. Defaults to 2 . timeout : the maximum time in seconds to wait for a response from the API. Defaults to 120 . 
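A hedged sketch (values are illustrative) of overriding these runtime parameters explicitly at instantiation time instead of relying on environment variables: ```python\nllm = GroqLLM(\n    model=\"llama3-70b-8192\",\n    base_url=\"https://api.groq.com\",\n    api_key=\"api.key\",\n    max_retries=2,\n    timeout=120,\n)\n``` 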
Examples: Generate text: from distilabel.models.llms import GroqLLM\n\nllm = GroqLLM(model=\"llama3-70b-8192\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n\n```python\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import GroqLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = GroqLLM(\n model=\"llama3-70b-8192\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/groq.py class GroqLLM(AsyncLLM):\n \"\"\"Groq API implementation using the async client for concurrent text generation.\n\n Attributes:\n model: the name of the model from the Groq API to use for the generation.\n base_url: the base URL to use for the Groq API requests. Defaults to\n `\"https://api.groq.com\"`.\n api_key: the API key to authenticate the requests to the Groq API. Defaults to\n the value of the `GROQ_API_KEY` environment variable.\n max_retries: the maximum number of times to retry the request to the API before\n failing. Defaults to `2`.\n timeout: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n _api_key_env_var: the name of the environment variable to use for the API key.\n _aclient: the `AsyncGroq` client from the `groq` package.\n\n Runtime parameters:\n - `base_url`: the base URL to use for the Groq API requests. Defaults to\n `\"https://api.groq.com\"`.\n - `api_key`: the API key to authenticate the requests to the Groq API. Defaults to\n the value of the `GROQ_API_KEY` environment variable.\n - `max_retries`: the maximum number of times to retry the request to the API before\n failing. Defaults to `2`.\n - `timeout`: the maximum time in seconds to wait for a response from the API. 
Defaults\n to `120`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import GroqLLM\n\n llm = GroqLLM(model=\"llama3-70b-8192\")\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import GroqLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = GroqLLM(\n model=\"llama3-70b-8192\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n _GROQ_API_BASE_URL_ENV_VAR_NAME, \"https://api.groq.com\"\n ),\n description=\"The base URL to use for the Groq API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_GROQ_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Groq API.\",\n )\n max_retries: RuntimeParameter[int] = Field(\n default=2,\n description=\"The maximum number of times to retry the request to the API before\"\n \" failing.\",\n )\n timeout: RuntimeParameter[int] = Field(\n default=120,\n description=\"The maximum time in seconds to wait for a response from the API.\",\n )\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _num_generations_param_supported = False\n\n _api_key_env_var: str = PrivateAttr(_GROQ_API_KEY_ENV_VAR_NAME)\n _aclient: Optional[\"AsyncGroq\"] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the `AsyncGroq` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from groq import AsyncGroq\n except ImportError as ie:\n raise ImportError(\n \"Groq Python client is not installed. 
Please install it using\"\n ' `pip install groq` or from the extras as `pip install \"distilabel[groq]\"`.'\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._aclient = AsyncGroq(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"groq\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n seed: Optional[int] = None,\n max_new_tokens: int = 128,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[str] = None,\n ) -> \"GenerateOutput\":\n \"\"\"Generates `num_generations` responses for the given input using the Groq async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n seed: the seed to use for the generation. Defaults to `None`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: the stop sequence to use for the generation. Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n\n References:\n - https://console.groq.com/docs/text-chat\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"groq\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"seed\": seed,\n \"temperature\": temperature,\n \"max_tokens\": max_new_tokens,\n \"top_p\": top_p,\n \"stream\": False,\n \"stop\": stop,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n if structured_output:\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n generations = []\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using the Groq client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n return prepare_output(generations, **self._get_llm_statistics(completion))\n\n @staticmethod\n def _get_llm_statistics(completion: \"ChatCompletion\") -> \"LLMStatistics\":\n return {\n \"input_tokens\": [completion.usage.prompt_tokens if completion else 0],\n \"output_tokens\": [completion.usage.completion_tokens if completion else 0],\n }\n 
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.load","title":"load() ","text":"Loads the AsyncGroq client to benefit from async requests. Source code in src/distilabel/models/llms/groq.py def load(self) -> None:\n \"\"\"Loads the `AsyncGroq` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from groq import AsyncGroq\n except ImportError as ie:\n raise ImportError(\n \"Groq Python client is not installed. Please install it using\"\n ' `pip install groq` or from the extras as `pip install \"distilabel[groq]\"`.'\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._aclient = AsyncGroq(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"groq\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.agenerate","title":"agenerate(input, seed=None, max_new_tokens=128, temperature=1.0, top_p=1.0, stop=None) async ","text":"Generates num_generations responses for the given input using the Groq async client. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required seed Optional[int] the seed to use for the generation. Defaults to None . None max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 stop Optional[str] the stop sequence to use for the generation. Defaults to None . None Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. References - https://console.groq.com/docs/text-chat
Source code in src/distilabel/models/llms/groq.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n seed: Optional[int] = None,\n max_new_tokens: int = 128,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[str] = None,\n) -> \"GenerateOutput\":\n \"\"\"Generates `num_generations` responses for the given input using the Groq async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n seed: the seed to use for the generation. Defaults to `None`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: the stop sequence to use for the generation. Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n\n References:\n - https://console.groq.com/docs/text-chat\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"groq\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"seed\": seed,\n \"temperature\": temperature,\n \"max_tokens\": max_new_tokens,\n \"top_p\": top_p,\n \"stream\": False,\n \"stop\": stop,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n if structured_output:\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n generations = []\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using the Groq client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n return prepare_output(generations, **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM","title":"InferenceEndpointsLLM ","text":" Bases: AsyncLLM , MagpieChatTemplateMixin InferenceEndpoints LLM implementation running the async API client. This LLM will internally use huggingface_hub.AsyncInferenceClient . Attributes: Name Type Description model_id Optional[str] the model ID to use for the LLM as available in the Hugging Face Hub, which will be used to resolve the base URL for the serverless Inference Endpoints API requests. Defaults to None . endpoint_name Optional[RuntimeParameter[str]] the name of the Inference Endpoint to use for the LLM. Defaults to None . endpoint_namespace Optional[RuntimeParameter[str]] the namespace of the Inference Endpoint to use for the LLM. Defaults to None . base_url Optional[RuntimeParameter[str]] the base URL to use for the Inference Endpoints API requests. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Inference Endpoints API. tokenizer_id Optional[str] the tokenizer ID to use for the LLM as available in the Hugging Face Hub. Defaults to None , but defining one is recommended to properly format the prompt. 
model_display_name Optional[str] the model display name to use for the LLM. Defaults to None . use_magpie_template Optional[str] a flag used to enable/disable applying the Magpie pre-query template. Defaults to False . magpie_pre_query_template Optional[str] the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . structured_output Optional[RuntimeParameter[StructuredOutputType]] a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. Icon :hugging: Examples: Free serverless Inference API, set the input_batch_size of the Task that uses this to avoid Model is overloaded: from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Dedicated Inference Endpoints: from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n endpoint_name=\"<ENDPOINT_NAME>\",\n api_key=\"<HF_API_KEY>\",\n endpoint_namespace=\"<USER|ORG>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Dedicated Inference Endpoints or TGI: from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n api_key=\"<HF_API_KEY>\",\n base_url=\"<BASE_URL>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pydantic import BaseModel\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n api_key=\"api.key\",\n structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py class InferenceEndpointsLLM(AsyncLLM, MagpieChatTemplateMixin):\n \"\"\"InferenceEndpoints LLM implementation running the async API client.\n\n This LLM will internally use `huggingface_hub.AsyncInferenceClient`.\n\n Attributes:\n model_id: the model ID to use for the LLM as available in the Hugging Face Hub, which\n will be used to resolve the base URL for the serverless Inference Endpoints API requests.\n Defaults to `None`.\n endpoint_name: the name of the Inference Endpoint to use for the LLM. Defaults to `None`.\n endpoint_namespace: the namespace of the Inference Endpoint to use for the LLM. Defaults to `None`.\n base_url: the base URL to use for the Inference Endpoints API requests.\n api_key: the API key to authenticate the requests to the Inference Endpoints API.\n tokenizer_id: the tokenizer ID to use for the LLM as available in the Hugging Face Hub.\n Defaults to `None`, but defining one is recommended to properly format the prompt.\n model_display_name: the model display name to use for the LLM. 
Defaults to `None`.\n use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n template. Defaults to `False`.\n magpie_pre_query_template: the pre-query template to be applied to the prompt or\n sent to the LLM to generate an instruction or a follow up user message. Valid\n values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults\n to `None`.\n structured_output: a dictionary containing the structured output configuration or\n if more fine-grained control is needed, an instance of `OutlinesStructuredOutput`.\n Defaults to None.\n\n Icon:\n `:hugging:`\n\n Examples:\n Free serverless Inference API, set the input_batch_size of the Task that uses this to avoid Model is overloaded:\n\n ```python\n from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Dedicated Inference Endpoints:\n\n ```python\n from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n llm = InferenceEndpointsLLM(\n endpoint_name=\"<ENDPOINT_NAME>\",\n api_key=\"<HF_API_KEY>\",\n endpoint_namespace=\"<USER|ORG>\",\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Dedicated Inference Endpoints or TGI:\n\n ```python\n from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n llm = InferenceEndpointsLLM(\n api_key=\"<HF_API_KEY>\",\n base_url=\"<BASE_URL>\",\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import InferenceEndpointsLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n api_key=\"api.key\",\n structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n ```\n \"\"\"\n\n model_id: Optional[str] = None\n\n endpoint_name: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The name of the Inference Endpoint to use for the LLM.\",\n )\n endpoint_namespace: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The namespace of the Inference Endpoint to use for the LLM.\",\n )\n base_url: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The base URL to use for the Inference Endpoints API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR),\n description=\"The API key to authenticate the requests to the Inference Endpoints API.\",\n )\n\n tokenizer_id: Optional[str] = None\n model_display_name: Optional[str] = None\n\n structured_output: Optional[RuntimeParameter[StructuredOutputType]] = Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n\n _num_generations_param_supported = False\n\n _model_name: Optional[str] = PrivateAttr(default=None)\n _tokenizer: Optional[\"PreTrainedTokenizer\"] = PrivateAttr(default=None)\n _api_key_env_var: str = 
PrivateAttr(HF_TOKEN_ENV_VAR)\n _aclient: Optional[\"AsyncInferenceClient\"] = PrivateAttr(...)\n\n @model_validator(mode=\"after\") # type: ignore\n def only_one_of_model_id_endpoint_name_or_base_url_provided(\n self,\n ) -> \"InferenceEndpointsLLM\":\n \"\"\"Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also\n provided, a warning will be shown informing the user that the provided `base_url` will be ignored in\n favour of the dynamically calculated one..\"\"\"\n\n if self.base_url and (self.model_id or self.endpoint_name):\n self._logger.warning( # type: ignore\n f\"Since the `base_url={self.base_url}` is available and either one of `model_id`\"\n \" or `endpoint_name` is also provided, the `base_url` will either be ignored\"\n \" or overwritten with the one generated from either of those args, for serverless\"\n \" or dedicated inference endpoints, respectively.\"\n )\n\n if self.use_magpie_template and self.tokenizer_id is None:\n raise ValueError(\n \"`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,\"\n \" set a `tokenizer_id` and try again.\"\n )\n\n if (\n self.model_id\n and self.tokenizer_id is None\n and self.structured_output is not None\n ):\n self.tokenizer_id = self.model_id\n\n if self.base_url and not (self.model_id or self.endpoint_name):\n return self\n\n if self.model_id and not self.endpoint_name:\n return self\n\n if self.endpoint_name and not self.model_id:\n return self\n\n raise ValidationError(\n f\"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is\"\n f\" provided too, it will be overwritten instead. Found `model_id`={self.model_id},\"\n f\" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}.\"\n )\n\n def load(self) -> None: # noqa: C901\n \"\"\"Loads the `AsyncInferenceClient` client to connect to the Hugging Face Inference\n Endpoint.\n\n Raises:\n ImportError: if the `huggingface-hub` Python client is not installed.\n ValueError: if the model is not currently deployed or is not running the TGI framework.\n ImportError: if the `transformers` Python client is not installed.\n \"\"\"\n super().load()\n\n try:\n from huggingface_hub import (\n AsyncInferenceClient,\n InferenceClient,\n get_inference_endpoint,\n )\n except ImportError as ie:\n raise ImportError(\n \"Hugging Face Hub Python client is not installed. 
Please install it using\"\n \" `pip install huggingface-hub`.\"\n ) from ie\n\n if self.api_key is None:\n self.api_key = SecretStr(get_hf_token(self.__class__.__name__, \"api_key\"))\n\n if self.model_id is not None:\n client = InferenceClient(\n model=self.model_id, token=self.api_key.get_secret_value()\n )\n status = client.get_model_status()\n\n if (\n status.state not in {\"Loadable\", \"Loaded\"}\n and status.framework != \"text-generation-inference\"\n ):\n raise ValueError(\n f\"Model {self.model_id} is not currently deployed or is not running the TGI framework\"\n )\n\n self.base_url = client._resolve_url(\n model=self.model_id, task=\"text-generation\"\n )\n\n if self.endpoint_name is not None:\n client = get_inference_endpoint(\n name=self.endpoint_name,\n namespace=self.endpoint_namespace,\n token=self.api_key.get_secret_value(),\n )\n if client.status in [\"paused\", \"scaledToZero\"]:\n client.resume().wait(timeout=300)\n elif client.status == \"initializing\":\n client.wait(timeout=300)\n\n self.base_url = client.url\n self._model_name = client.repository\n\n self._aclient = AsyncInferenceClient(\n base_url=self.base_url,\n token=self.api_key.get_secret_value(),\n )\n\n if self.tokenizer_id:\n try:\n from transformers import AutoTokenizer\n except ImportError as ie:\n raise ImportError(\n \"Transformers Python client is not installed. Please install it using\"\n \" `pip install transformers`.\"\n ) from ie\n\n self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id)\n\n @property\n @override\n def model_name(self) -> Union[str, None]: # type: ignore\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return (\n self.model_display_name\n or self._model_name\n or self.model_id\n or self.endpoint_name\n or self.base_url\n )\n\n def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n prompt: str = (\n self._tokenizer.apply_chat_template( # type: ignore\n conversation=input, # type: ignore\n tokenize=False,\n add_generation_prompt=True,\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n\n def _get_structured_output(\n self, input: FormattedInput\n ) -> Tuple[\"StandardInput\", Union[Dict[str, Any], None]]:\n \"\"\"Gets the structured output (if any) for the given input.\n\n Args:\n input: a single input in chat format to generate responses for.\n\n Returns:\n The input and the structured output that will be passed as `grammar` to the\n inference endpoint or `None` if not required.\n \"\"\"\n structured_output = None\n\n # Specific structured output per input\n if isinstance(input, tuple):\n input, structured_output = input\n structured_output = {\n \"type\": structured_output[\"format\"], # type: ignore\n \"value\": structured_output[\"schema\"], # type: ignore\n }\n\n # Same structured output for all the inputs\n if structured_output is None and self.structured_output is not None:\n try:\n structured_output = {\n \"type\": self.structured_output[\"format\"], # type: ignore\n \"value\": self.structured_output[\"schema\"], # type: ignore\n }\n except KeyError as e:\n raise ValueError(\n \"To use the structured output you have to inform the `format` and `schema` in \"\n \"the `structured_output` attribute.\"\n ) from e\n\n if structured_output:\n if isinstance(structured_output[\"value\"], ModelMetaclass):\n 
structured_output[\"value\"] = structured_output[\n \"value\"\n ].model_json_schema()\n\n return input, structured_output\n\n async def _generate_with_text_generation(\n self,\n input: FormattedInput,\n max_new_tokens: int = 128,\n repetition_penalty: Optional[float] = None,\n frequency_penalty: Optional[float] = None,\n temperature: float = 1.0,\n do_sample: bool = False,\n top_n_tokens: Optional[int] = None,\n top_p: Optional[float] = None,\n top_k: Optional[int] = None,\n typical_p: Optional[float] = None,\n stop_sequences: Union[List[str], None] = None,\n return_full_text: bool = False,\n seed: Optional[int] = None,\n watermark: bool = False,\n ) -> GenerateOutput:\n input, structured_output = self._get_structured_output(input)\n prompt = self.prepare_input(input)\n generation: Union[\"TextGenerationOutput\", None] = None\n try:\n generation = await self._aclient.text_generation( # type: ignore\n prompt=prompt,\n max_new_tokens=max_new_tokens,\n do_sample=do_sample,\n typical_p=typical_p,\n repetition_penalty=repetition_penalty,\n frequency_penalty=frequency_penalty,\n temperature=temperature,\n top_n_tokens=top_n_tokens,\n top_p=top_p,\n top_k=top_k,\n stop_sequences=stop_sequences,\n return_full_text=return_full_text,\n # NOTE: here to ensure that the cache is not used and a different response is\n # generated every time\n seed=seed or random.randint(0, sys.maxsize),\n watermark=watermark,\n grammar=structured_output, # type: ignore\n details=True,\n )\n except Exception as e:\n self._logger.warning( # type: ignore\n f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n f\" Finish reason was: {e}\"\n )\n return prepare_output(\n generations=[generation.generated_text] if generation else [None],\n input_tokens=[compute_tokens(prompt, self._tokenizer.encode)], # type: ignore\n output_tokens=[\n generation.details.generated_tokens\n if generation and generation.details\n else 0\n ],\n logprobs=self._get_logprobs_from_text_generation(generation)\n if generation\n else None, # type: ignore\n )\n\n def _get_logprobs_from_text_generation(\n self, generation: \"TextGenerationOutput\"\n ) -> Union[List[List[List[\"Logprob\"]]], None]:\n if generation.details is None or generation.details.top_tokens is None:\n return None\n\n return [\n [\n [\n {\"token\": top_logprob[\"text\"], \"logprob\": top_logprob[\"logprob\"]}\n for top_logprob in token_logprobs\n ]\n for token_logprobs in generation.details.top_tokens\n ]\n ]\n\n async def _generate_with_chat_completion(\n self,\n input: \"StandardInput\",\n max_new_tokens: int = 128,\n frequency_penalty: Optional[float] = None,\n logit_bias: Optional[List[float]] = None,\n logprobs: bool = False,\n presence_penalty: Optional[float] = None,\n seed: Optional[int] = None,\n stop_sequences: Optional[List[str]] = None,\n temperature: float = 1.0,\n tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n tool_prompt: Optional[str] = None,\n tools: Optional[List[Dict[str, Any]]] = None,\n top_logprobs: Optional[PositiveInt] = None,\n top_p: Optional[float] = None,\n ) -> GenerateOutput:\n message = None\n completion: Union[\"ChatCompletionOutput\", None] = None\n output_logprobs = None\n try:\n completion = await self._aclient.chat_completion( # type: ignore\n messages=input, # type: ignore\n max_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n logit_bias=logit_bias,\n logprobs=logprobs,\n presence_penalty=presence_penalty,\n # NOTE: here to ensure that the cache is not used and a 
different response is\n # generated every time\n seed=seed or random.randint(0, sys.maxsize),\n stop=stop_sequences,\n temperature=temperature,\n tool_choice=tool_choice, # type: ignore\n tool_prompt=tool_prompt,\n tools=tools, # type: ignore\n top_logprobs=top_logprobs,\n top_p=top_p,\n )\n choice = completion.choices[0] # type: ignore\n if (message := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n if choice_logprobs := self._get_logprobs_from_choice(choice):\n output_logprobs = [choice_logprobs]\n except Exception as e:\n self._logger.warning( # type: ignore\n f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n f\" Finish reason was: {e}\"\n )\n return prepare_output(\n generations=[message],\n input_tokens=[completion.usage.prompt_tokens] if completion else None,\n output_tokens=[completion.usage.completion_tokens] if completion else None,\n logprobs=output_logprobs,\n )\n\n def _get_logprobs_from_choice(\n self, choice: \"ChatCompletionOutputComplete\"\n ) -> Union[List[List[\"Logprob\"]], None]:\n if choice.logprobs is None:\n return None\n\n return [\n [\n {\"token\": top_logprob.token, \"logprob\": top_logprob.logprob}\n for top_logprob in token_logprobs.top_logprobs\n ]\n for token_logprobs in choice.logprobs.content\n ]\n\n def _check_stop_sequences(\n self,\n stop_sequences: Optional[Union[str, List[str]]] = None,\n ) -> Union[List[str], None]:\n \"\"\"Checks that no more than 4 stop sequences are provided.\n\n Args:\n stop_sequences: the stop sequences to be checked.\n\n Returns:\n The stop sequences.\n \"\"\"\n if stop_sequences is not None:\n if isinstance(stop_sequences, str):\n stop_sequences = [stop_sequences]\n if len(stop_sequences) > 4:\n warnings.warn(\n \"Only up to 4 stop sequences are allowed, so keeping the first 4 items only.\",\n UserWarning,\n stacklevel=2,\n )\n stop_sequences = stop_sequences[:4]\n return stop_sequences\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_new_tokens: int = 128,\n frequency_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n logit_bias: Optional[List[float]] = None,\n logprobs: bool = False,\n presence_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n seed: Optional[int] = None,\n stop_sequences: Optional[List[str]] = None,\n temperature: float = 1.0,\n tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n tool_prompt: Optional[str] = None,\n tools: Optional[List[Dict[str, Any]]] = None,\n top_logprobs: Optional[PositiveInt] = None,\n top_n_tokens: Optional[PositiveInt] = None,\n top_p: Optional[float] = None,\n do_sample: bool = False,\n repetition_penalty: Optional[float] = None,\n return_full_text: bool = False,\n top_k: Optional[int] = None,\n typical_p: Optional[float] = None,\n watermark: bool = False,\n ) -> GenerateOutput:\n \"\"\"Generates completions for the given input using the async client. 
This method\n uses two methods of the `huggingface_hub.AsyncClient`: `chat_completion` and `text_generation`.\n `chat_completion` method will be used only if no `tokenizer_id` has been specified.\n Some arguments of this function are specific to the `text_generation` method, while\n some others are specific to the `chat_completion` method.\n\n Args:\n input: a single input in chat format to generate responses for.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n new tokens based on their existing frequency in the text so far, decreasing\n model's likelihood to repeat the same line verbatim. Defauls to `None`.\n logit_bias: modify the likelihood of specified tokens appearing in the completion.\n This argument is exclusive to the `chat_completion` method and will be used\n only if `tokenizer_id` is `None`.\n Defaults to `None`.\n logprobs: whether to return the log probabilities or not. This argument is exclusive\n to the `chat_completion` method and will be used only if `tokenizer_id`\n is `None`. Defaults to `False`.\n presence_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n new tokens based on whether they appear in the text so far, increasing the\n model likelihood to talk about new topics. This argument is exclusive to\n the `chat_completion` method and will be used only if `tokenizer_id` is\n `None`. Defauls to `None`.\n seed: the seed to use for the generation. Defaults to `None`.\n stop_sequences: either a single string or a list of strings containing the sequences\n to stop the generation at. Defaults to `None`, but will be set to the\n `tokenizer.eos_token` if available.\n temperature: the temperature to use for the generation. Defaults to `1.0`.\n tool_choice: the name of the tool the model should call. It can be a dictionary\n like `{\"function_name\": \"my_tool\"}` or \"auto\". If not provided, then the\n model won't use any tool. This argument is exclusive to the `chat_completion`\n method and will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n tool_prompt: A prompt to be appended before the tools. This argument is exclusive\n to the `chat_completion` method and will be used only if `tokenizer_id`\n is `None`. Defauls to `None`.\n tools: a list of tools definitions that the LLM can use.\n This argument is exclusive to the `chat_completion` method and will be used\n only if `tokenizer_id` is `None`. Defaults to `None`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. This argument is exclusive to the `chat_completion` method and\n will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n top_n_tokens: the number of top log probabilities to return per output token\n generated. This argument is exclusive of the `text_generation` method and\n will be only used if `tokenizer_id` is not `None`. Defaults to `None`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n do_sample: whether to use sampling for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id` is not\n `None`. Defaults to `False`.\n repetition_penalty: the repetition penalty to use for the generation. This argument\n is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. 
Defaults to `None`.\n return_full_text: whether to return the full text of the completion or just\n the generated text. Defaults to `False`, meaning that only the generated\n text will be returned. This argument is exclusive of the `text_generation`\n method and will be only used if `tokenizer_id` is not `None`.\n top_k: the top-k value to use for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `0.8`, since neither `0.0` nor `1.0` are valid\n values in TGI.\n typical_p: the typical-p value to use for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `None`.\n watermark: whether to add the watermark to the generated text. This argument\n is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n stop_sequences = self._check_stop_sequences(stop_sequences)\n\n if self.tokenizer_id is None:\n return await self._generate_with_chat_completion(\n input=input, # type: ignore\n max_new_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n logit_bias=logit_bias,\n logprobs=logprobs,\n presence_penalty=presence_penalty,\n seed=seed,\n stop_sequences=stop_sequences,\n temperature=temperature,\n tool_choice=tool_choice,\n tool_prompt=tool_prompt,\n tools=tools,\n top_logprobs=top_logprobs,\n top_p=top_p,\n )\n\n return await self._generate_with_text_generation(\n input=input,\n max_new_tokens=max_new_tokens,\n do_sample=do_sample,\n typical_p=typical_p,\n repetition_penalty=repetition_penalty,\n frequency_penalty=frequency_penalty,\n temperature=temperature,\n top_n_tokens=top_n_tokens,\n top_p=top_p,\n top_k=top_k,\n stop_sequences=stop_sequences,\n return_full_text=return_full_text,\n seed=seed,\n watermark=watermark,\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.model_name","title":"model_name: Union[str, None] property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.only_one_of_model_id_endpoint_name_or_base_url_provided","title":"only_one_of_model_id_endpoint_name_or_base_url_provided() ","text":"Validates that only one of model_id or endpoint_name is provided; and if base_url is also provided, a warning will be shown informing the user that the provided base_url will be ignored in favour of the dynamically calculated one.. 
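To illustrate the rule (identifiers are placeholders), providing exactly one of `model_id` or `endpoint_name` is accepted, while providing both raises a validation error at instantiation time: ```python\n# accepted: serverless endpoint resolved from the model ID\nllm = InferenceEndpointsLLM(model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\")\n\n# rejected: both identifiers at once\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    endpoint_name=\"<ENDPOINT_NAME>\",\n)\n``` 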
Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py @model_validator(mode=\"after\") # type: ignore\ndef only_one_of_model_id_endpoint_name_or_base_url_provided(\n self,\n) -> \"InferenceEndpointsLLM\":\n \"\"\"Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also\n provided, a warning will be shown informing the user that the provided `base_url` will be ignored in\n favour of the dynamically calculated one..\"\"\"\n\n if self.base_url and (self.model_id or self.endpoint_name):\n self._logger.warning( # type: ignore\n f\"Since the `base_url={self.base_url}` is available and either one of `model_id`\"\n \" or `endpoint_name` is also provided, the `base_url` will either be ignored\"\n \" or overwritten with the one generated from either of those args, for serverless\"\n \" or dedicated inference endpoints, respectively.\"\n )\n\n if self.use_magpie_template and self.tokenizer_id is None:\n raise ValueError(\n \"`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,\"\n \" set a `tokenizer_id` and try again.\"\n )\n\n if (\n self.model_id\n and self.tokenizer_id is None\n and self.structured_output is not None\n ):\n self.tokenizer_id = self.model_id\n\n if self.base_url and not (self.model_id or self.endpoint_name):\n return self\n\n if self.model_id and not self.endpoint_name:\n return self\n\n if self.endpoint_name and not self.model_id:\n return self\n\n raise ValidationError(\n f\"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is\"\n f\" provided too, it will be overwritten instead. Found `model_id`={self.model_id},\"\n f\" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}.\"\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.load","title":"load() ","text":"Loads the AsyncInferenceClient client to connect to the Hugging Face Inference Endpoint. Raises: Type Description ImportError if the huggingface-hub Python client is not installed. ValueError if the model is not currently deployed or is not running the TGI framework. ImportError if the transformers Python client is not installed. Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py def load(self) -> None: # noqa: C901\n \"\"\"Loads the `AsyncInferenceClient` client to connect to the Hugging Face Inference\n Endpoint.\n\n Raises:\n ImportError: if the `huggingface-hub` Python client is not installed.\n ValueError: if the model is not currently deployed or is not running the TGI framework.\n ImportError: if the `transformers` Python client is not installed.\n \"\"\"\n super().load()\n\n try:\n from huggingface_hub import (\n AsyncInferenceClient,\n InferenceClient,\n get_inference_endpoint,\n )\n except ImportError as ie:\n raise ImportError(\n \"Hugging Face Hub Python client is not installed. 
Please install it using\"\n \" `pip install huggingface-hub`.\"\n ) from ie\n\n if self.api_key is None:\n self.api_key = SecretStr(get_hf_token(self.__class__.__name__, \"api_key\"))\n\n if self.model_id is not None:\n client = InferenceClient(\n model=self.model_id, token=self.api_key.get_secret_value()\n )\n status = client.get_model_status()\n\n if (\n status.state not in {\"Loadable\", \"Loaded\"}\n and status.framework != \"text-generation-inference\"\n ):\n raise ValueError(\n f\"Model {self.model_id} is not currently deployed or is not running the TGI framework\"\n )\n\n self.base_url = client._resolve_url(\n model=self.model_id, task=\"text-generation\"\n )\n\n if self.endpoint_name is not None:\n client = get_inference_endpoint(\n name=self.endpoint_name,\n namespace=self.endpoint_namespace,\n token=self.api_key.get_secret_value(),\n )\n if client.status in [\"paused\", \"scaledToZero\"]:\n client.resume().wait(timeout=300)\n elif client.status == \"initializing\":\n client.wait(timeout=300)\n\n self.base_url = client.url\n self._model_name = client.repository\n\n self._aclient = AsyncInferenceClient(\n base_url=self.base_url,\n token=self.api_key.get_secret_value(),\n )\n\n if self.tokenizer_id:\n try:\n from transformers import AutoTokenizer\n except ImportError as ie:\n raise ImportError(\n \"Transformers Python client is not installed. Please install it using\"\n \" `pip install transformers`.\"\n ) from ie\n\n self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.prepare_input","title":"prepare_input(input) ","text":"Prepares the input (applying the chat template and tokenization) for the provided input. Parameters: Name Type Description Default input StandardInput the input list containing chat items. required Returns: Type Description str The prompt to send to the LLM. Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n prompt: str = (\n self._tokenizer.apply_chat_template( # type: ignore\n conversation=input, # type: ignore\n tokenize=False,\n add_generation_prompt=True,\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM._get_structured_output","title":"_get_structured_output(input) ","text":"Gets the structured output (if any) for the given input. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required Returns: Type Description StandardInput The input and the structured output that will be passed as grammar to the Union[Dict[str, Any], None] inference endpoint or None if not required. 
Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py def _get_structured_output(\n self, input: FormattedInput\n) -> Tuple[\"StandardInput\", Union[Dict[str, Any], None]]:\n \"\"\"Gets the structured output (if any) for the given input.\n\n Args:\n input: a single input in chat format to generate responses for.\n\n Returns:\n The input and the structured output that will be passed as `grammar` to the\n inference endpoint or `None` if not required.\n \"\"\"\n structured_output = None\n\n # Specific structured output per input\n if isinstance(input, tuple):\n input, structured_output = input\n structured_output = {\n \"type\": structured_output[\"format\"], # type: ignore\n \"value\": structured_output[\"schema\"], # type: ignore\n }\n\n # Same structured output for all the inputs\n if structured_output is None and self.structured_output is not None:\n try:\n structured_output = {\n \"type\": self.structured_output[\"format\"], # type: ignore\n \"value\": self.structured_output[\"schema\"], # type: ignore\n }\n except KeyError as e:\n raise ValueError(\n \"To use the structured output you have to inform the `format` and `schema` in \"\n \"the `structured_output` attribute.\"\n ) from e\n\n if structured_output:\n if isinstance(structured_output[\"value\"], ModelMetaclass):\n structured_output[\"value\"] = structured_output[\n \"value\"\n ].model_json_schema()\n\n return input, structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM._check_stop_sequences","title":"_check_stop_sequences(stop_sequences=None) ","text":"Checks that no more than 4 stop sequences are provided. Parameters: Name Type Description Default stop_sequences Optional[Union[str, List[str]]] the stop sequences to be checked. None Returns: Type Description Union[List[str], None] The stop sequences. Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py def _check_stop_sequences(\n self,\n stop_sequences: Optional[Union[str, List[str]]] = None,\n) -> Union[List[str], None]:\n \"\"\"Checks that no more than 4 stop sequences are provided.\n\n Args:\n stop_sequences: the stop sequences to be checked.\n\n Returns:\n The stop sequences.\n \"\"\"\n if stop_sequences is not None:\n if isinstance(stop_sequences, str):\n stop_sequences = [stop_sequences]\n if len(stop_sequences) > 4:\n warnings.warn(\n \"Only up to 4 stop sequences are allowed, so keeping the first 4 items only.\",\n UserWarning,\n stacklevel=2,\n )\n stop_sequences = stop_sequences[:4]\n return stop_sequences\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.agenerate","title":"agenerate(input, max_new_tokens=128, frequency_penalty=None, logit_bias=None, logprobs=False, presence_penalty=None, seed=None, stop_sequences=None, temperature=1.0, tool_choice=None, tool_prompt=None, tools=None, top_logprobs=None, top_n_tokens=None, top_p=None, do_sample=False, repetition_penalty=None, return_full_text=False, top_k=None, typical_p=None, watermark=False) async ","text":"Generates completions for the given input using the async client. This method uses two methods of the huggingface_hub.AsyncClient : chat_completion and text_generation . chat_completion method will be used only if no tokenizer_id has been specified. Some arguments of this function are specific to the text_generation method, while some others are specific to the chat_completion method. 
Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 frequency_penalty Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] a value between -2.0 and 2.0 . Positive values penalize new tokens based on their existing frequency in the text so far, decreasing model's likelihood to repeat the same line verbatim. Defauls to None . None logit_bias Optional[List[float]] modify the likelihood of specified tokens appearing in the completion. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defaults to None . None logprobs bool whether to return the log probabilities or not. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defaults to False . False presence_penalty Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] a value between -2.0 and 2.0 . Positive values penalize new tokens based on whether they appear in the text so far, increasing the model likelihood to talk about new topics. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defauls to None . None seed Optional[int] the seed to use for the generation. Defaults to None . None stop_sequences Optional[List[str]] either a single string or a list of strings containing the sequences to stop the generation at. Defaults to None , but will be set to the tokenizer.eos_token if available. None temperature float the temperature to use for the generation. Defaults to 1.0 . 1.0 tool_choice Optional[Union[Dict[str, str], Literal['auto']]] the name of the tool the model should call. It can be a dictionary like {\"function_name\": \"my_tool\"} or \"auto\". If not provided, then the model won't use any tool. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defaults to None . None tool_prompt Optional[str] A prompt to be appended before the tools. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defauls to None . None tools Optional[List[Dict[str, Any]]] a list of tools definitions that the LLM can use. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defaults to None . None top_logprobs Optional[PositiveInt] the number of top log probabilities to return per output token generated. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defaults to None . None top_n_tokens Optional[PositiveInt] the number of top log probabilities to return per output token generated. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to None . None top_p Optional[float] the top-p value to use for the generation. Defaults to 1.0 . None do_sample bool whether to use sampling for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to False . False repetition_penalty Optional[float] the repetition penalty to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to None . None return_full_text bool whether to return the full text of the completion or just the generated text. 
Defaults to False , meaning that only the generated text will be returned. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . False top_k Optional[int] the top-k value to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to 0.8 , since neither 0.0 nor 1.0 are valid values in TGI. None typical_p Optional[float] the typical-p value to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to None . None watermark bool whether to add the watermark to the generated text. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to None . False Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_new_tokens: int = 128,\n frequency_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n logit_bias: Optional[List[float]] = None,\n logprobs: bool = False,\n presence_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n seed: Optional[int] = None,\n stop_sequences: Optional[List[str]] = None,\n temperature: float = 1.0,\n tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n tool_prompt: Optional[str] = None,\n tools: Optional[List[Dict[str, Any]]] = None,\n top_logprobs: Optional[PositiveInt] = None,\n top_n_tokens: Optional[PositiveInt] = None,\n top_p: Optional[float] = None,\n do_sample: bool = False,\n repetition_penalty: Optional[float] = None,\n return_full_text: bool = False,\n top_k: Optional[int] = None,\n typical_p: Optional[float] = None,\n watermark: bool = False,\n) -> GenerateOutput:\n \"\"\"Generates completions for the given input using the async client. This method\n uses two methods of the `huggingface_hub.AsyncClient`: `chat_completion` and `text_generation`.\n `chat_completion` method will be used only if no `tokenizer_id` has been specified.\n Some arguments of this function are specific to the `text_generation` method, while\n some others are specific to the `chat_completion` method.\n\n Args:\n input: a single input in chat format to generate responses for.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n new tokens based on their existing frequency in the text so far, decreasing\n model's likelihood to repeat the same line verbatim. Defauls to `None`.\n logit_bias: modify the likelihood of specified tokens appearing in the completion.\n This argument is exclusive to the `chat_completion` method and will be used\n only if `tokenizer_id` is `None`.\n Defaults to `None`.\n logprobs: whether to return the log probabilities or not. This argument is exclusive\n to the `chat_completion` method and will be used only if `tokenizer_id`\n is `None`. Defaults to `False`.\n presence_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n new tokens based on whether they appear in the text so far, increasing the\n model likelihood to talk about new topics. This argument is exclusive to\n the `chat_completion` method and will be used only if `tokenizer_id` is\n `None`. 
Defauls to `None`.\n seed: the seed to use for the generation. Defaults to `None`.\n stop_sequences: either a single string or a list of strings containing the sequences\n to stop the generation at. Defaults to `None`, but will be set to the\n `tokenizer.eos_token` if available.\n temperature: the temperature to use for the generation. Defaults to `1.0`.\n tool_choice: the name of the tool the model should call. It can be a dictionary\n like `{\"function_name\": \"my_tool\"}` or \"auto\". If not provided, then the\n model won't use any tool. This argument is exclusive to the `chat_completion`\n method and will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n tool_prompt: A prompt to be appended before the tools. This argument is exclusive\n to the `chat_completion` method and will be used only if `tokenizer_id`\n is `None`. Defauls to `None`.\n tools: a list of tools definitions that the LLM can use.\n This argument is exclusive to the `chat_completion` method and will be used\n only if `tokenizer_id` is `None`. Defaults to `None`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. This argument is exclusive to the `chat_completion` method and\n will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n top_n_tokens: the number of top log probabilities to return per output token\n generated. This argument is exclusive of the `text_generation` method and\n will be only used if `tokenizer_id` is not `None`. Defaults to `None`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n do_sample: whether to use sampling for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id` is not\n `None`. Defaults to `False`.\n repetition_penalty: the repetition penalty to use for the generation. This argument\n is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `None`.\n return_full_text: whether to return the full text of the completion or just\n the generated text. Defaults to `False`, meaning that only the generated\n text will be returned. This argument is exclusive of the `text_generation`\n method and will be only used if `tokenizer_id` is not `None`.\n top_k: the top-k value to use for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `0.8`, since neither `0.0` nor `1.0` are valid\n values in TGI.\n typical_p: the typical-p value to use for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `None`.\n watermark: whether to add the watermark to the generated text. This argument\n is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. 
Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n stop_sequences = self._check_stop_sequences(stop_sequences)\n\n if self.tokenizer_id is None:\n return await self._generate_with_chat_completion(\n input=input, # type: ignore\n max_new_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n logit_bias=logit_bias,\n logprobs=logprobs,\n presence_penalty=presence_penalty,\n seed=seed,\n stop_sequences=stop_sequences,\n temperature=temperature,\n tool_choice=tool_choice,\n tool_prompt=tool_prompt,\n tools=tools,\n top_logprobs=top_logprobs,\n top_p=top_p,\n )\n\n return await self._generate_with_text_generation(\n input=input,\n max_new_tokens=max_new_tokens,\n do_sample=do_sample,\n typical_p=typical_p,\n repetition_penalty=repetition_penalty,\n frequency_penalty=frequency_penalty,\n temperature=temperature,\n top_n_tokens=top_n_tokens,\n top_p=top_p,\n top_k=top_k,\n stop_sequences=stop_sequences,\n return_full_text=return_full_text,\n seed=seed,\n watermark=watermark,\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM","title":"TransformersLLM ","text":" Bases: LLM , MagpieChatTemplateMixin , CudaDevicePlacementMixin Hugging Face transformers library LLM implementation using the text generation pipeline. Attributes: Name Type Description model str the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. revision str if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . torch_dtype str the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\" . trust_remote_code bool whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . model_kwargs Optional[Dict[str, Any]] additional dictionary of keyword arguments that will be passed to the from_pretrained method of the model. tokenizer Optional[str] the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer config files. If not provided, the one associated to the model will be used. Defaults to None . use_fast bool whether to use a fast tokenizer or not. Defaults to True . chat_template Optional[str] a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None . device Optional[Union[str, int]] the name or index of the device where the model will be loaded. Defaults to None . device_map Optional[Union[str, Dict[str, Any]]] a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\" . Defaults to None . token Optional[SecretStr] the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None . structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]] a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. use_magpie_template Optional[RuntimeParameter[OutlinesStructuredOutputType]] a flag used to enable/disable applying the Magpie pre-query template. 
Defaults to False . magpie_pre_query_template Optional[RuntimeParameter[OutlinesStructuredOutputType]] the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . Icon :hugging: Examples: Generate text: from distilabel.models.llms import TransformersLLM\n\nllm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Source code in src/distilabel/models/llms/huggingface/transformers.py class TransformersLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin):\n \"\"\"Hugging Face `transformers` library LLM implementation using the text generation\n pipeline.\n\n Attributes:\n model: the model Hugging Face Hub repo id or a path to a directory containing the\n model weights and configuration files.\n revision: if `model` refers to a Hugging Face Hub repository, then the revision\n (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc.\n Defaults to `\"auto\"`.\n trust_remote_code: whether to allow fetching and executing remote code fetched\n from the repository in the Hub. Defaults to `False`.\n model_kwargs: additional dictionary of keyword arguments that will be passed to\n the `from_pretrained` method of the model.\n tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing\n the tokenizer config files. If not provided, the one associated to the `model`\n will be used. Defaults to `None`.\n use_fast: whether to use a fast tokenizer or not. Defaults to `True`.\n chat_template: a chat template that will be used to build the prompts before\n sending them to the model. If not provided, the chat template defined in the\n tokenizer config will be used. If not provided and the tokenizer doesn't have\n a chat template, then ChatML template will be used. Defaults to `None`.\n device: the name or index of the device where the model will be loaded. Defaults\n to `None`.\n device_map: a dictionary mapping each layer of the model to a device, or a mode\n like `\"sequential\"` or `\"auto\"`. Defaults to `None`.\n token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n local configuration will be used. Defaults to `None`.\n structured_output: a dictionary containing the structured output configuration or if more\n fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n template. Defaults to `False`.\n magpie_pre_query_template: the pre-query template to be applied to the prompt or\n sent to the LLM to generate an instruction or a follow up user message. Valid\n values are \"llama3\", \"qwen2\" or another pre-query template provided. 
Defaults\n to `None`.\n\n Icon:\n `:hugging:`\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import TransformersLLM\n\n llm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n \"\"\"\n\n model: str\n revision: str = \"main\"\n torch_dtype: str = \"auto\"\n trust_remote_code: bool = False\n model_kwargs: Optional[Dict[str, Any]] = None\n tokenizer: Optional[str] = None\n use_fast: bool = True\n chat_template: Optional[str] = None\n device: Optional[Union[str, int]] = None\n device_map: Optional[Union[str, Dict[str, Any]]] = None\n token: Optional[SecretStr] = Field(\n default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR)\n )\n structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n\n _pipeline: Optional[\"Pipeline\"] = PrivateAttr(...)\n _prefix_allowed_tokens_fn: Union[Callable, None] = PrivateAttr(default=None)\n\n def load(self) -> None:\n \"\"\"Loads the model and tokenizer and creates the text generation pipeline. In addition,\n it will configure the tokenizer chat template.\"\"\"\n if self.device == \"cuda\":\n CudaDevicePlacementMixin.load(self)\n\n try:\n from transformers import pipeline\n except ImportError as ie:\n raise ImportError(\n \"Transformers is not installed. Please install it using `pip install transformers`.\"\n ) from ie\n\n token = self.token.get_secret_value() if self.token is not None else self.token\n\n self._pipeline = pipeline(\n \"text-generation\",\n model=self.model,\n revision=self.revision,\n torch_dtype=self.torch_dtype,\n trust_remote_code=self.trust_remote_code,\n model_kwargs=self.model_kwargs or {},\n tokenizer=self.tokenizer or self.model,\n use_fast=self.use_fast,\n device=self.device,\n device_map=self.device_map,\n token=token,\n return_full_text=False,\n )\n\n if self.chat_template is not None:\n self._pipeline.tokenizer.chat_template = self.chat_template # type: ignore\n\n if self._pipeline.tokenizer.pad_token is None: # type: ignore\n self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token # type: ignore\n\n if self.structured_output:\n self._prefix_allowed_tokens_fn = self._prepare_structured_output(\n self.structured_output\n )\n\n super().load()\n\n def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n if self._pipeline.tokenizer.chat_template: # type: ignore\n return input[0][\"content\"]\n\n prompt: str = (\n self._pipeline.tokenizer.apply_chat_template( # type: ignore\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True,\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n\n @validate_call\n def generate( # type: ignore\n self,\n inputs: List[StandardInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n temperature: float = 0.1,\n repetition_penalty: float = 1.1,\n top_p: float = 1.0,\n 
top_k: int = 0,\n do_sample: bool = True,\n ) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for each input using the text generation\n pipeline.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n repetition_penalty: the repetition penalty to use for the generation. Defaults\n to `1.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n top_k: the top-k value to use for the generation. Defaults to `0`.\n do_sample: whether to use sampling or not. Defaults to `True`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n prepared_inputs = [self.prepare_input(input=input) for input in inputs]\n\n outputs: List[List[Dict[str, str]]] = self._pipeline( # type: ignore\n prepared_inputs,\n max_new_tokens=max_new_tokens,\n temperature=temperature,\n repetition_penalty=repetition_penalty,\n top_p=top_p,\n top_k=top_k,\n do_sample=do_sample,\n num_return_sequences=num_generations,\n prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn,\n pad_token_id=self._pipeline.tokenizer.eos_token_id, # type: ignore\n )\n llm_output = [\n [generation[\"generated_text\"] for generation in output]\n for output in outputs\n ]\n\n result = []\n for input, output in zip(inputs, llm_output):\n result.append(\n prepare_output(\n output,\n input_tokens=[\n compute_tokens(input, self._pipeline.tokenizer.encode)\n ],\n output_tokens=[\n compute_tokens(row, self._pipeline.tokenizer.encode)\n for row in output\n ],\n )\n )\n\n return result\n\n def get_last_hidden_states(\n self, inputs: List[\"StandardInput\"]\n ) -> List[\"HiddenState\"]:\n \"\"\"Gets the last `hidden_states` of the model for the given inputs. 
It doesn't\n execute the task head.\n\n Args:\n inputs: a list of inputs in chat format to generate the embeddings for.\n\n Returns:\n A list containing the last hidden state for each sequence using a NumPy array\n with shape [num_tokens, hidden_size].\n \"\"\"\n model: \"PreTrainedModel\" = (\n self._pipeline.model.model # type: ignore\n if hasattr(self._pipeline.model, \"model\") # type: ignore\n else next(self._pipeline.model.children()) # type: ignore\n )\n tokenizer: \"PreTrainedTokenizer\" = self._pipeline.tokenizer # type: ignore\n input_ids = tokenizer(\n [self.prepare_input(input) for input in inputs], # type: ignore\n return_tensors=\"pt\",\n padding=True,\n ).to(model.device)\n last_hidden_states = model(**input_ids)[\"last_hidden_state\"]\n\n return [\n seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy()\n for seq_last_hidden_state, attention_mask in zip(\n last_hidden_states,\n input_ids[\"attention_mask\"], # type: ignore\n )\n ]\n\n def _prepare_structured_output(\n self, structured_output: Optional[OutlinesStructuredOutputType] = None\n ) -> Union[Callable, None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n result = prepare_guided_output(\n structured_output, \"transformers\", self._pipeline\n )\n if schema := result.get(\"schema\"):\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.load","title":"load() ","text":"Loads the model and tokenizer and creates the text generation pipeline. In addition, it will configure the tokenizer chat template. Source code in src/distilabel/models/llms/huggingface/transformers.py def load(self) -> None:\n \"\"\"Loads the model and tokenizer and creates the text generation pipeline. In addition,\n it will configure the tokenizer chat template.\"\"\"\n if self.device == \"cuda\":\n CudaDevicePlacementMixin.load(self)\n\n try:\n from transformers import pipeline\n except ImportError as ie:\n raise ImportError(\n \"Transformers is not installed. 
Please install it using `pip install transformers`.\"\n ) from ie\n\n token = self.token.get_secret_value() if self.token is not None else self.token\n\n self._pipeline = pipeline(\n \"text-generation\",\n model=self.model,\n revision=self.revision,\n torch_dtype=self.torch_dtype,\n trust_remote_code=self.trust_remote_code,\n model_kwargs=self.model_kwargs or {},\n tokenizer=self.tokenizer or self.model,\n use_fast=self.use_fast,\n device=self.device,\n device_map=self.device_map,\n token=token,\n return_full_text=False,\n )\n\n if self.chat_template is not None:\n self._pipeline.tokenizer.chat_template = self.chat_template # type: ignore\n\n if self._pipeline.tokenizer.pad_token is None: # type: ignore\n self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token # type: ignore\n\n if self.structured_output:\n self._prefix_allowed_tokens_fn = self._prepare_structured_output(\n self.structured_output\n )\n\n super().load()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.unload","title":"unload() ","text":"Unloads the vLLM model. Source code in src/distilabel/models/llms/huggingface/transformers.py def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.prepare_input","title":"prepare_input(input) ","text":"Prepares the input (applying the chat template and tokenization) for the provided input. Parameters: Name Type Description Default input StandardInput the input list containing chat items. required Returns: Type Description str The prompt to send to the LLM. Source code in src/distilabel/models/llms/huggingface/transformers.py def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n if self._pipeline.tokenizer.chat_template: # type: ignore\n return input[0][\"content\"]\n\n prompt: str = (\n self._pipeline.tokenizer.apply_chat_template( # type: ignore\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True,\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, temperature=0.1, repetition_penalty=1.1, top_p=1.0, top_k=0, do_sample=True) ","text":"Generates num_generations responses for each input using the text generation pipeline. Parameters: Name Type Description Default inputs List[StandardInput] a list of inputs in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 temperature float the temperature to use for the generation. Defaults to 0.1 . 0.1 repetition_penalty float the repetition penalty to use for the generation. Defaults to 1.1 . 1.1 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 top_k int the top-k value to use for the generation. Defaults to 0 . 0 do_sample bool whether to use sampling or not. Defaults to True . True Returns: Type Description List[GenerateOutput] A list of lists of strings containing the generated responses for each input. 
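As a quick illustration of the parameters above, here is a hedged usage sketch assembled from the documented signature (not copied from the docs) that reuses the model id from the class-level example:

```python
from distilabel.models.llms import TransformersLLM

llm = TransformersLLM(model="microsoft/Phi-3-mini-4k-instruct")
llm.load()

# Two sampled generations per input, overriding a few of the documented defaults.
outputs = llm.generate(
    inputs=[[{"role": "user", "content": "Hello world!"}]],
    num_generations=2,
    max_new_tokens=64,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)
# One entry per input, each holding the `num_generations` generated texts
# (plus the token statistics assembled by `prepare_output`).
print(outputs[0])
```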
Source code in src/distilabel/models/llms/huggingface/transformers.py @validate_call\ndef generate( # type: ignore\n self,\n inputs: List[StandardInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n temperature: float = 0.1,\n repetition_penalty: float = 1.1,\n top_p: float = 1.0,\n top_k: int = 0,\n do_sample: bool = True,\n) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for each input using the text generation\n pipeline.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n repetition_penalty: the repetition penalty to use for the generation. Defaults\n to `1.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n top_k: the top-k value to use for the generation. Defaults to `0`.\n do_sample: whether to use sampling or not. Defaults to `True`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n prepared_inputs = [self.prepare_input(input=input) for input in inputs]\n\n outputs: List[List[Dict[str, str]]] = self._pipeline( # type: ignore\n prepared_inputs,\n max_new_tokens=max_new_tokens,\n temperature=temperature,\n repetition_penalty=repetition_penalty,\n top_p=top_p,\n top_k=top_k,\n do_sample=do_sample,\n num_return_sequences=num_generations,\n prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn,\n pad_token_id=self._pipeline.tokenizer.eos_token_id, # type: ignore\n )\n llm_output = [\n [generation[\"generated_text\"] for generation in output]\n for output in outputs\n ]\n\n result = []\n for input, output in zip(inputs, llm_output):\n result.append(\n prepare_output(\n output,\n input_tokens=[\n compute_tokens(input, self._pipeline.tokenizer.encode)\n ],\n output_tokens=[\n compute_tokens(row, self._pipeline.tokenizer.encode)\n for row in output\n ],\n )\n )\n\n return result\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.get_last_hidden_states","title":"get_last_hidden_states(inputs) ","text":"Gets the last hidden_states of the model for the given inputs. It doesn't execute the task head. Parameters: Name Type Description Default inputs List[StandardInput] a list of inputs in chat format to generate the embeddings for. required Returns: Type Description List[HiddenState] A list containing the last hidden state for each sequence using a NumPy array List[HiddenState] with shape [num_tokens, hidden_size]. Source code in src/distilabel/models/llms/huggingface/transformers.py def get_last_hidden_states(\n self, inputs: List[\"StandardInput\"]\n) -> List[\"HiddenState\"]:\n \"\"\"Gets the last `hidden_states` of the model for the given inputs. 
It doesn't\n execute the task head.\n\n Args:\n inputs: a list of inputs in chat format to generate the embeddings for.\n\n Returns:\n A list containing the last hidden state for each sequence using a NumPy array\n with shape [num_tokens, hidden_size].\n \"\"\"\n model: \"PreTrainedModel\" = (\n self._pipeline.model.model # type: ignore\n if hasattr(self._pipeline.model, \"model\") # type: ignore\n else next(self._pipeline.model.children()) # type: ignore\n )\n tokenizer: \"PreTrainedTokenizer\" = self._pipeline.tokenizer # type: ignore\n input_ids = tokenizer(\n [self.prepare_input(input) for input in inputs], # type: ignore\n return_tensors=\"pt\",\n padding=True,\n ).to(model.device)\n last_hidden_states = model(**input_ids)[\"last_hidden_state\"]\n\n return [\n seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy()\n for seq_last_hidden_state, attention_mask in zip(\n last_hidden_states,\n input_ids[\"attention_mask\"], # type: ignore\n )\n ]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output=None) ","text":"Creates the appropriate function to filter tokens to generate structured outputs. Parameters: Name Type Description Default structured_output Optional[OutlinesStructuredOutputType] the configuration dict to prepare the structured output. None Returns: Type Description Union[Callable, None] The callable that will be used to guide the generation of the model. Source code in src/distilabel/models/llms/huggingface/transformers.py def _prepare_structured_output(\n self, structured_output: Optional[OutlinesStructuredOutputType] = None\n) -> Union[Callable, None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n result = prepare_guided_output(\n structured_output, \"transformers\", self._pipeline\n )\n if schema := result.get(\"schema\"):\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM","title":"LiteLLM ","text":" Bases: AsyncLLM LiteLLM implementation running the async API client. Attributes: Name Type Description model str the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\", etc. verbose RuntimeParameter[bool] whether to log the LiteLLM client's logs. Defaults to False . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . Runtime parameters verbose : whether to log the LiteLLM client's logs. Defaults to False . 
Examples: Generate text: from distilabel.models.llms import LiteLLM\n\nllm = LiteLLM(model=\"gpt-3.5-turbo\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n\n```python\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import LiteLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = LiteLLM(\n model=\"gpt-3.5-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/litellm.py class LiteLLM(AsyncLLM):\n \"\"\"LiteLLM implementation running the async API client.\n\n Attributes:\n model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\",\n etc.\n verbose: whether to log the LiteLLM client's logs. Defaults to `False`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n\n Runtime parameters:\n - `verbose`: whether to log the LiteLLM client's logs. Defaults to `False`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import LiteLLM\n\n llm = LiteLLM(model=\"gpt-3.5-turbo\")\n\n llm.load()\n\n # Call the model\n output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import LiteLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = LiteLLM(\n model=\"gpt-3.5-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n verbose: RuntimeParameter[bool] = Field(\n default=False, description=\"Whether to log the LiteLLM client's logs.\"\n )\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _aclient: Optional[Callable] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"\n Loads the `acompletion` LiteLLM client to benefit from async requests.\n \"\"\"\n super().load()\n\n try:\n import litellm\n\n litellm.telemetry = False\n except ImportError as e:\n raise ImportError(\n \"LiteLLM Python client is not installed. 
Please install it using\"\n \" `pip install litellm`.\"\n ) from e\n self._aclient = litellm.acompletion\n\n if not self.verbose:\n litellm.suppress_debug_info = True\n for key in logging.Logger.manager.loggerDict.keys():\n if \"litellm\" not in key.lower():\n continue\n logging.getLogger(key).setLevel(logging.CRITICAL)\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"litellm\",\n )\n self._aclient = result.get(\"client\")\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n @validate_call\n async def agenerate( # type: ignore # noqa: C901\n self,\n input: FormattedInput,\n num_generations: int = 1,\n functions: Optional[List] = None,\n function_call: Optional[str] = None,\n temperature: Optional[float] = 1.0,\n top_p: Optional[float] = 1.0,\n stop: Optional[Union[str, list]] = None,\n max_tokens: Optional[int] = None,\n presence_penalty: Optional[float] = None,\n frequency_penalty: Optional[float] = None,\n logit_bias: Optional[dict] = None,\n user: Optional[str] = None,\n metadata: Optional[dict] = None,\n api_base: Optional[str] = None,\n api_version: Optional[str] = None,\n api_key: Optional[str] = None,\n model_list: Optional[list] = None,\n mock_response: Optional[str] = None,\n force_timeout: Optional[int] = 600,\n custom_llm_provider: Optional[str] = None,\n ) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the [LiteLLM async client](https://github.com/BerriAI/litellm).\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n functions: a list of functions to apply to the conversation messages. Defaults to\n `None`.\n function_call: the name of the function to call within the conversation. Defaults\n to `None`.\n temperature: the temperature to use for the generation. Defaults to `1.0`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: Up to 4 sequences where the LLM API will stop generating further tokens.\n Defaults to `None`.\n max_tokens: The maximum number of tokens in the generated completion. Defaults to\n `None`.\n presence_penalty: It is used to penalize new tokens based on their existence in the\n text so far. Defaults to `None`.\n frequency_penalty: It is used to penalize new tokens based on their frequency in the\n text so far. Defaults to `None`.\n logit_bias: Used to modify the probability of specific tokens appearing in the\n completion. Defaults to `None`.\n user: A unique identifier representing your end-user. This can help the LLM provider\n to monitor and detect abuse. Defaults to `None`.\n metadata: Pass in additional metadata to tag your completion calls - eg. prompt\n version, details, etc. Defaults to `None`.\n api_base: Base URL for the API. Defaults to `None`.\n api_version: API version. Defaults to `None`.\n api_key: API key. Defaults to `None`.\n model_list: List of api base, version, keys. Defaults to `None`.\n mock_response: If provided, return a mock completion response for testing or debugging\n purposes. 
Defaults to `None`.\n force_timeout: The maximum execution time in seconds for the completion request.\n Defaults to `600`.\n custom_llm_provider: Used for Non-OpenAI LLMs, Example usage for bedrock, set(iterable)\n model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". Defaults to\n `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n import litellm\n from litellm import token_counter\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"litellm\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"model\": self.model,\n \"messages\": input,\n \"n\": num_generations,\n \"functions\": functions,\n \"function_call\": function_call,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stream\": False,\n \"stop\": stop,\n \"max_tokens\": max_tokens,\n \"presence_penalty\": presence_penalty,\n \"frequency_penalty\": frequency_penalty,\n \"logit_bias\": logit_bias,\n \"user\": user,\n \"metadata\": metadata,\n \"api_base\": api_base,\n \"api_version\": api_version,\n \"api_key\": api_key,\n \"model_list\": model_list,\n \"mock_response\": mock_response,\n \"force_timeout\": force_timeout,\n \"custom_llm_provider\": custom_llm_provider,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n async def _call_aclient_until_n_choices() -> List[\"Choices\"]:\n choices = []\n while len(choices) < num_generations:\n completion = await self._aclient(**kwargs) # type: ignore\n if not self.structured_output:\n completion = completion.choices\n choices.extend(completion)\n return choices\n\n # litellm.drop_params is used to en/disable sending **kwargs parameters to the API if they cannot be used\n try:\n litellm.drop_params = False\n choices = await _call_aclient_until_n_choices()\n except litellm.exceptions.APIError as e:\n if \"does not support parameters\" in str(e):\n litellm.drop_params = True\n choices = await _call_aclient_until_n_choices()\n else:\n raise e\n\n generations = []\n input_tokens = [\n token_counter(model=self.model, messages=input)\n ] * num_generations\n output_tokens = []\n\n if self.structured_output:\n for choice in choices:\n generations.append(choice.model_dump_json())\n output_tokens.append(\n token_counter(\n model=self.model,\n text=orjson.dumps(choice.model_dump_json()).decode(\"utf-8\"),\n )\n )\n return prepare_output(\n generations,\n input_tokens=input_tokens,\n output_tokens=output_tokens,\n )\n\n for choice in choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using LiteLLM client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n output_tokens.append(token_counter(model=self.model, text=content))\n\n return prepare_output(\n generations, input_tokens=input_tokens, output_tokens=output_tokens\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. 
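The `mock_response` argument of `agenerate` (shown in the source above) makes LiteLLM return a canned completion without calling any provider API, which can be handy for testing pipelines offline. A minimal sketch, assuming `litellm` is installed and that the mocked call needs no provider key:

```python
import asyncio

from distilabel.models.llms import LiteLLM

llm = LiteLLM(model="gpt-3.5-turbo")
llm.load()

# `mock_response` short-circuits the provider call and returns the given text.
output = asyncio.run(
    llm.agenerate(
        input=[{"role": "user", "content": "Hello world!"}],
        mock_response="Hi there!",
    )
)
print(output)
```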
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.load","title":"load() ","text":"Loads the acompletion LiteLLM client to benefit from async requests. Source code in src/distilabel/models/llms/litellm.py def load(self) -> None:\n \"\"\"\n Loads the `acompletion` LiteLLM client to benefit from async requests.\n \"\"\"\n super().load()\n\n try:\n import litellm\n\n litellm.telemetry = False\n except ImportError as e:\n raise ImportError(\n \"LiteLLM Python client is not installed. Please install it using\"\n \" `pip install litellm`.\"\n ) from e\n self._aclient = litellm.acompletion\n\n if not self.verbose:\n litellm.suppress_debug_info = True\n for key in logging.Logger.manager.loggerDict.keys():\n if \"litellm\" not in key.lower():\n continue\n logging.getLogger(key).setLevel(logging.CRITICAL)\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"litellm\",\n )\n self._aclient = result.get(\"client\")\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.agenerate","title":"agenerate(input, num_generations=1, functions=None, function_call=None, temperature=1.0, top_p=1.0, stop=None, max_tokens=None, presence_penalty=None, frequency_penalty=None, logit_bias=None, user=None, metadata=None, api_base=None, api_version=None, api_key=None, model_list=None, mock_response=None, force_timeout=600, custom_llm_provider=None) async ","text":"Generates num_generations responses for the given input using the LiteLLM async client. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 functions Optional[List] a list of functions to apply to the conversation messages. Defaults to None . None function_call Optional[str] the name of the function to call within the conversation. Defaults to None . None temperature Optional[float] the temperature to use for the generation. Defaults to 1.0 . 1.0 top_p Optional[float] the top-p value to use for the generation. Defaults to 1.0 . 1.0 stop Optional[Union[str, list]] Up to 4 sequences where the LLM API will stop generating further tokens. Defaults to None . None max_tokens Optional[int] The maximum number of tokens in the generated completion. Defaults to None . None presence_penalty Optional[float] It is used to penalize new tokens based on their existence in the text so far. Defaults to None . None frequency_penalty Optional[float] It is used to penalize new tokens based on their frequency in the text so far. Defaults to None . None logit_bias Optional[dict] Used to modify the probability of specific tokens appearing in the completion. Defaults to None . None user Optional[str] A unique identifier representing your end-user. This can help the LLM provider to monitor and detect abuse. Defaults to None . None metadata Optional[dict] Pass in additional metadata to tag your completion calls - eg. prompt version, details, etc. Defaults to None . None api_base Optional[str] Base URL for the API. Defaults to None . None api_version Optional[str] API version. Defaults to None . None api_key Optional[str] API key. Defaults to None . None model_list Optional[list] List of api base, version, keys. Defaults to None . 
None mock_response Optional[str] If provided, return a mock completion response for testing or debugging purposes. Defaults to None . None force_timeout Optional[int] The maximum execution time in seconds for the completion request. Defaults to 600 . 600 custom_llm_provider Optional[str] Used for Non-OpenAI LLMs, Example usage for bedrock, set(iterable) model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". Defaults to None . None Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/litellm.py @validate_call\nasync def agenerate( # type: ignore # noqa: C901\n self,\n input: FormattedInput,\n num_generations: int = 1,\n functions: Optional[List] = None,\n function_call: Optional[str] = None,\n temperature: Optional[float] = 1.0,\n top_p: Optional[float] = 1.0,\n stop: Optional[Union[str, list]] = None,\n max_tokens: Optional[int] = None,\n presence_penalty: Optional[float] = None,\n frequency_penalty: Optional[float] = None,\n logit_bias: Optional[dict] = None,\n user: Optional[str] = None,\n metadata: Optional[dict] = None,\n api_base: Optional[str] = None,\n api_version: Optional[str] = None,\n api_key: Optional[str] = None,\n model_list: Optional[list] = None,\n mock_response: Optional[str] = None,\n force_timeout: Optional[int] = 600,\n custom_llm_provider: Optional[str] = None,\n) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the [LiteLLM async client](https://github.com/BerriAI/litellm).\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n functions: a list of functions to apply to the conversation messages. Defaults to\n `None`.\n function_call: the name of the function to call within the conversation. Defaults\n to `None`.\n temperature: the temperature to use for the generation. Defaults to `1.0`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: Up to 4 sequences where the LLM API will stop generating further tokens.\n Defaults to `None`.\n max_tokens: The maximum number of tokens in the generated completion. Defaults to\n `None`.\n presence_penalty: It is used to penalize new tokens based on their existence in the\n text so far. Defaults to `None`.\n frequency_penalty: It is used to penalize new tokens based on their frequency in the\n text so far. Defaults to `None`.\n logit_bias: Used to modify the probability of specific tokens appearing in the\n completion. Defaults to `None`.\n user: A unique identifier representing your end-user. This can help the LLM provider\n to monitor and detect abuse. Defaults to `None`.\n metadata: Pass in additional metadata to tag your completion calls - eg. prompt\n version, details, etc. Defaults to `None`.\n api_base: Base URL for the API. Defaults to `None`.\n api_version: API version. Defaults to `None`.\n api_key: API key. Defaults to `None`.\n model_list: List of api base, version, keys. Defaults to `None`.\n mock_response: If provided, return a mock completion response for testing or debugging\n purposes. Defaults to `None`.\n force_timeout: The maximum execution time in seconds for the completion request.\n Defaults to `600`.\n custom_llm_provider: Used for Non-OpenAI LLMs, Example usage for bedrock, set(iterable)\n model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". 
Defaults to\n `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n import litellm\n from litellm import token_counter\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"litellm\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"model\": self.model,\n \"messages\": input,\n \"n\": num_generations,\n \"functions\": functions,\n \"function_call\": function_call,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stream\": False,\n \"stop\": stop,\n \"max_tokens\": max_tokens,\n \"presence_penalty\": presence_penalty,\n \"frequency_penalty\": frequency_penalty,\n \"logit_bias\": logit_bias,\n \"user\": user,\n \"metadata\": metadata,\n \"api_base\": api_base,\n \"api_version\": api_version,\n \"api_key\": api_key,\n \"model_list\": model_list,\n \"mock_response\": mock_response,\n \"force_timeout\": force_timeout,\n \"custom_llm_provider\": custom_llm_provider,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n async def _call_aclient_until_n_choices() -> List[\"Choices\"]:\n choices = []\n while len(choices) < num_generations:\n completion = await self._aclient(**kwargs) # type: ignore\n if not self.structured_output:\n completion = completion.choices\n choices.extend(completion)\n return choices\n\n # litellm.drop_params is used to en/disable sending **kwargs parameters to the API if they cannot be used\n try:\n litellm.drop_params = False\n choices = await _call_aclient_until_n_choices()\n except litellm.exceptions.APIError as e:\n if \"does not support parameters\" in str(e):\n litellm.drop_params = True\n choices = await _call_aclient_until_n_choices()\n else:\n raise e\n\n generations = []\n input_tokens = [\n token_counter(model=self.model, messages=input)\n ] * num_generations\n output_tokens = []\n\n if self.structured_output:\n for choice in choices:\n generations.append(choice.model_dump_json())\n output_tokens.append(\n token_counter(\n model=self.model,\n text=orjson.dumps(choice.model_dump_json()).decode(\"utf-8\"),\n )\n )\n return prepare_output(\n generations,\n input_tokens=input_tokens,\n output_tokens=output_tokens,\n )\n\n for choice in choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using LiteLLM client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n output_tokens.append(token_counter(model=self.model, text=content))\n\n return prepare_output(\n generations, input_tokens=input_tokens, output_tokens=output_tokens\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM","title":"LlamaCppLLM ","text":" Bases: LLM llama.cpp LLM implementation running the Python bindings for the C++ code. Attributes: Name Type Description model_path RuntimeParameter[FilePath] contains the path to the GGUF quantized model, compatible with the installed version of the llama.cpp Python bindings. n_gpu_layers RuntimeParameter[int] the number of layers to use for the GPU. Defaults to -1 , meaning that the available GPU device will be used. chat_format Optional[RuntimeParameter[str]] the chat format to use for the model. 
Defaults to None , which means the Llama format will be used. n_ctx int the context size to use for the model. Defaults to 512 . n_batch int the prompt processing maximum batch size to use for the model. Defaults to 512 . seed int random seed to use for the generation. Defaults to 4294967295 . verbose RuntimeParameter[bool] whether to print verbose output. Defaults to False . structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]] a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]] additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {} . _model Optional[Llama] the Llama model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. Runtime parameters model_path : the path to the GGUF quantized model. n_gpu_layers : the number of layers to use for the GPU. Defaults to -1 . chat_format : the chat format to use for the model. Defaults to None . verbose : whether to print verbose output. Defaults to False . extra_kwargs : additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {} . References llama.cpp llama-cpp-python Examples: Generate text: from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\n\n# You can follow along this example downloading the following model running the following\n# command in the terminal, that will download the model to the `Downloads` folder:\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nllm = LlamaCppLLM(\n model_path=str(Path.home() / model_path),\n n_gpu_layers=-1, # To use the GPU if available\n n_ctx=1024, # Set the context size\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = LlamaCppLLM(\n model_path=str(Path.home() / model_path), # type: ignore\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"json\", \"schema\": Character},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/llamacpp.py class LlamaCppLLM(LLM):\n \"\"\"llama.cpp LLM implementation running the Python bindings for the C++ code.\n\n Attributes:\n model_path: contains the path to the GGUF quantized model, compatible with the\n installed version of the `llama.cpp` Python bindings.\n n_gpu_layers: the number of layers to use for the GPU. Defaults to `-1`, meaning that\n the available GPU device will be used.\n chat_format: the chat format to use for the model. Defaults to `None`, which means the\n Llama format will be used.\n n_ctx: the context size to use for the model. Defaults to `512`.\n n_batch: the prompt processing maximum batch size to use for the model. 
Defaults to `512`.\n seed: random seed to use for the generation. Defaults to `4294967295`.\n verbose: whether to print verbose output. Defaults to `False`.\n structured_output: a dictionary containing the structured output configuration or if more\n fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n `Llama` class of `llama_cpp` library. Defaults to `{}`.\n _model: the Llama model instance. This attribute is meant to be used internally and\n should not be accessed directly. It will be set in the `load` method.\n\n Runtime parameters:\n - `model_path`: the path to the GGUF quantized model.\n - `n_gpu_layers`: the number of layers to use for the GPU. Defaults to `-1`.\n - `chat_format`: the chat format to use for the model. Defaults to `None`.\n - `verbose`: whether to print verbose output. Defaults to `False`.\n - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to the\n `Llama` class of `llama_cpp` library. Defaults to `{}`.\n\n References:\n - [`llama.cpp`](https://github.com/ggerganov/llama.cpp)\n - [`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python)\n\n Examples:\n Generate text:\n\n ```python\n from pathlib import Path\n from distilabel.models.llms import LlamaCppLLM\n\n # You can follow along this example downloading the following model running the following\n # command in the terminal, that will download the model to the `Downloads` folder:\n # curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\n model_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\n llm = LlamaCppLLM(\n model_path=str(Path.home() / model_path),\n n_gpu_layers=-1, # To use the GPU if available\n n_ctx=1024, # Set the context size\n )\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pathlib import Path\n from distilabel.models.llms import LlamaCppLLM\n\n model_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = LlamaCppLLM(\n model_path=str(Path.home() / model_path), # type: ignore\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"json\", \"schema\": Character},\n )\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model_path: RuntimeParameter[FilePath] = Field(\n default=None, description=\"The path to the GGUF quantized model.\", exclude=True\n )\n n_gpu_layers: RuntimeParameter[int] = Field(\n default=-1,\n description=\"The number of layers that will be loaded in the GPU.\",\n )\n chat_format: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The chat format to use for the model. 
Defaults to `None`, which means the Llama format will be used.\",\n )\n\n n_ctx: int = 512\n n_batch: int = 512\n seed: int = 4294967295\n verbose: RuntimeParameter[bool] = Field(\n default=False,\n description=\"Whether to print verbose output from llama.cpp library.\",\n )\n extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n default_factory=dict,\n description=\"Additional dictionary of keyword arguments that will be passed to the\"\n \" `Llama` class of `llama_cpp` library. See all the supported arguments at: \"\n \"https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__\",\n )\n structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n\n _logits_processor: Optional[\"LogitsProcessorList\"] = PrivateAttr(default=None)\n _model: Optional[\"Llama\"] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the `Llama` model from the `model_path`.\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError as ie:\n raise ImportError(\n \"The `llama_cpp` package is required to use the `LlamaCppLLM` class.\"\n ) from ie\n\n self._model = Llama(\n model_path=self.model_path.as_posix(), # type: ignore\n seed=self.seed,\n n_ctx=self.n_ctx,\n n_batch=self.n_batch,\n chat_format=self.chat_format,\n n_gpu_layers=self.n_gpu_layers,\n verbose=self.verbose,\n **self.extra_kwargs,\n )\n\n if self.structured_output:\n self._logits_processor = self._prepare_structured_output(\n self.structured_output\n )\n\n # NOTE: Here because of the custom `logging` interface used, since it will create the logging name\n # out of the model name, which won't be available until the `Llama` instance is created.\n super().load()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self._model.model_path # type: ignore\n\n @validate_call\n def generate( # type: ignore\n self,\n inputs: List[FormattedInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n extra_generation_kwargs: Optional[Dict[str, Any]] = None,\n ) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for the given input using the Llama model.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n extra_generation_kwargs: dictionary with additional arguments to be passed to\n the `create_chat_completion` method. 
Reference at\n https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n structured_output = None\n batch_outputs = []\n for input in inputs:\n if isinstance(input, tuple):\n input, structured_output = input\n elif self.structured_output:\n structured_output = self.structured_output\n\n outputs = []\n output_tokens = []\n for _ in range(num_generations):\n # NOTE(plaguss): There seems to be a bug in how the logits processor\n # is used. Basically it consumes the FSM internally, and it isn't reinitialized\n # after each generation, so subsequent calls yield nothing. This is a workaround\n # until is fixed in the `llama_cpp` or `outlines` libraries.\n if structured_output:\n self._logits_processor = self._prepare_structured_output(\n structured_output\n )\n chat_completions: \"CreateChatCompletionResponse\" = (\n self._model.create_chat_completion( # type: ignore\n messages=input, # type: ignore\n max_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n presence_penalty=presence_penalty,\n temperature=temperature,\n top_p=top_p,\n logits_processor=self._logits_processor,\n **(extra_generation_kwargs or {}),\n )\n )\n outputs.append(chat_completions[\"choices\"][0][\"message\"][\"content\"])\n output_tokens.append(chat_completions[\"usage\"][\"completion_tokens\"])\n batch_outputs.append(\n prepare_output(\n outputs,\n input_tokens=[chat_completions[\"usage\"][\"prompt_tokens\"]]\n * num_generations,\n output_tokens=output_tokens,\n )\n )\n\n return batch_outputs\n\n def _prepare_structured_output(\n self, structured_output: Optional[OutlinesStructuredOutputType] = None\n ) -> Union[\"LogitsProcessorList\", None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n result = prepare_guided_output(structured_output, \"llamacpp\", self._model)\n if (schema := result.get(\"schema\")) and self.structured_output:\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.load","title":"load() ","text":"Loads the Llama model from the model_path . 
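As a complement to the source listing that follows, here is a minimal usage sketch (not taken from the distilabel docs) of how `extra_kwargs` set on `LlamaCppLLM` are forwarded to the `llama_cpp.Llama` constructor when `load()` runs. The `n_threads` option below is assumed to be a valid `Llama.__init__` argument and is only an illustration; any constructor option not exposed as a first-class attribute can be passed the same way.

```python
from pathlib import Path

from distilabel.models.llms import LlamaCppLLM

# Sketch: options not exposed directly by `LlamaCppLLM` (here `n_threads`,
# assumed to be accepted by `llama_cpp.Llama.__init__`) can be forwarded via
# `extra_kwargs`, which `load()` unpacks into the `Llama(...)` constructor.
llm = LlamaCppLLM(
    model_path=str(Path.home() / "Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf"),
    n_gpu_layers=-1,
    n_ctx=1024,
    extra_kwargs={"n_threads": 8},
)

llm.load()  # instantiates `llama_cpp.Llama` with the arguments above
```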
Source code in src/distilabel/models/llms/llamacpp.py def load(self) -> None:\n \"\"\"Loads the `Llama` model from the `model_path`.\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError as ie:\n raise ImportError(\n \"The `llama_cpp` package is required to use the `LlamaCppLLM` class.\"\n ) from ie\n\n self._model = Llama(\n model_path=self.model_path.as_posix(), # type: ignore\n seed=self.seed,\n n_ctx=self.n_ctx,\n n_batch=self.n_batch,\n chat_format=self.chat_format,\n n_gpu_layers=self.n_gpu_layers,\n verbose=self.verbose,\n **self.extra_kwargs,\n )\n\n if self.structured_output:\n self._logits_processor = self._prepare_structured_output(\n self.structured_output\n )\n\n # NOTE: Here because of the custom `logging` interface used, since it will create the logging name\n # out of the model name, which won't be available until the `Llama` instance is created.\n super().load()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, extra_generation_kwargs=None) ","text":"Generates num_generations responses for the given input using the Llama model. Parameters: Name Type Description Default inputs List[FormattedInput] a list of inputs in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 frequency_penalty float the repetition penalty to use for the generation. Defaults to 0.0 . 0.0 presence_penalty float the presence penalty to use for the generation. Defaults to 0.0 . 0.0 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 extra_generation_kwargs Optional[Dict[str, Any]] dictionary with additional arguments to be passed to the create_chat_completion method. Reference at https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion None Returns: Type Description List[GenerateOutput] A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/llamacpp.py @validate_call\ndef generate( # type: ignore\n self,\n inputs: List[FormattedInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n extra_generation_kwargs: Optional[Dict[str, Any]] = None,\n) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for the given input using the Llama model.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n extra_generation_kwargs: dictionary with additional arguments to be passed to\n the `create_chat_completion` method. 
Reference at\n https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n structured_output = None\n batch_outputs = []\n for input in inputs:\n if isinstance(input, tuple):\n input, structured_output = input\n elif self.structured_output:\n structured_output = self.structured_output\n\n outputs = []\n output_tokens = []\n for _ in range(num_generations):\n # NOTE(plaguss): There seems to be a bug in how the logits processor\n # is used. Basically it consumes the FSM internally, and it isn't reinitialized\n # after each generation, so subsequent calls yield nothing. This is a workaround\n # until is fixed in the `llama_cpp` or `outlines` libraries.\n if structured_output:\n self._logits_processor = self._prepare_structured_output(\n structured_output\n )\n chat_completions: \"CreateChatCompletionResponse\" = (\n self._model.create_chat_completion( # type: ignore\n messages=input, # type: ignore\n max_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n presence_penalty=presence_penalty,\n temperature=temperature,\n top_p=top_p,\n logits_processor=self._logits_processor,\n **(extra_generation_kwargs or {}),\n )\n )\n outputs.append(chat_completions[\"choices\"][0][\"message\"][\"content\"])\n output_tokens.append(chat_completions[\"usage\"][\"completion_tokens\"])\n batch_outputs.append(\n prepare_output(\n outputs,\n input_tokens=[chat_completions[\"usage\"][\"prompt_tokens\"]]\n * num_generations,\n output_tokens=output_tokens,\n )\n )\n\n return batch_outputs\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output=None) ","text":"Creates the appropriate function to filter tokens to generate structured outputs. Parameters: Name Type Description Default structured_output Optional[OutlinesStructuredOutputType] the configuration dict to prepare the structured output. None Returns: Type Description Union[LogitsProcessorList, None] The callable that will be used to guide the generation of the model. Source code in src/distilabel/models/llms/llamacpp.py def _prepare_structured_output(\n self, structured_output: Optional[OutlinesStructuredOutputType] = None\n) -> Union[\"LogitsProcessorList\", None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n result = prepare_guided_output(structured_output, \"llamacpp\", self._model)\n if (schema := result.get(\"schema\")) and self.structured_output:\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM","title":"MistralLLM ","text":" Bases: AsyncLLM Mistral LLM implementation running the async API client. Attributes: Name Type Description model str the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc. endpoint str the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\". api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Mistral API. 
Defaults to None which means that the value set for the environment variable OPENAI_API_KEY will be used, or None if not set. max_retries RuntimeParameter[int] the maximum number of retries to attempt when a request fails. Defaults to 5 . timeout RuntimeParameter[int] the maximum time in seconds to wait for a response. Defaults to 120 . max_concurrent_requests RuntimeParameter[int] the maximum number of concurrent requests to send. Defaults to 64 . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . _api_key_env_var str the name of the environment variable to use for the API key. It is meant to be used internally. _aclient Optional[Mistral] the Mistral to use for the Mistral API. It is meant to be used internally. Set in the load method. Runtime parameters api_key : the API key to authenticate the requests to the Mistral API. max_retries : the maximum number of retries to attempt when a request fails. Defaults to 5 . timeout : the maximum time in seconds to wait for a response. Defaults to 120 . max_concurrent_requests : the maximum number of concurrent requests to send. Defaults to 64 . Examples: Generate text: from distilabel.models.llms import MistralLLM\n\nllm = MistralLLM(model=\"open-mixtral-8x22b\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n\n```python\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import MistralLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = MistralLLM(\n model=\"open-mixtral-8x22b\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/mistral.py class MistralLLM(AsyncLLM):\n \"\"\"Mistral LLM implementation running the async API client.\n\n Attributes:\n model: the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc.\n endpoint: the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\".\n api_key: the API key to authenticate the requests to the Mistral API. Defaults to `None` which\n means that the value set for the environment variable `OPENAI_API_KEY` will be used, or\n `None` if not set.\n max_retries: the maximum number of retries to attempt when a request fails. Defaults to `5`.\n timeout: the maximum time in seconds to wait for a response. Defaults to `120`.\n max_concurrent_requests: the maximum number of concurrent requests to send. Defaults\n to `64`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n _api_key_env_var: the name of the environment variable to use for the API key. It is meant to\n be used internally.\n _aclient: the `Mistral` to use for the Mistral API. 
It is meant to be used internally.\n Set in the `load` method.\n\n Runtime parameters:\n - `api_key`: the API key to authenticate the requests to the Mistral API.\n - `max_retries`: the maximum number of retries to attempt when a request fails.\n Defaults to `5`.\n - `timeout`: the maximum time in seconds to wait for a response. Defaults to `120`.\n - `max_concurrent_requests`: the maximum number of concurrent requests to send.\n Defaults to `64`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import MistralLLM\n\n llm = MistralLLM(model=\"open-mixtral-8x22b\")\n\n llm.load()\n\n # Call the model\n output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import MistralLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = MistralLLM(\n model=\"open-mixtral-8x22b\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n endpoint: str = \"https://api.mistral.ai\"\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_MISTRALAI_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Mistral API.\",\n )\n max_retries: RuntimeParameter[int] = Field(\n default=6,\n description=\"The maximum number of times to retry the request to the API before\"\n \" failing.\",\n )\n timeout: RuntimeParameter[int] = Field(\n default=120,\n description=\"The maximum time in seconds to wait for a response from the API.\",\n )\n max_concurrent_requests: RuntimeParameter[int] = Field(\n default=64, description=\"The maximum number of concurrent requests to send.\"\n )\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _num_generations_param_supported = False\n\n _api_key_env_var: str = PrivateAttr(_MISTRALAI_API_KEY_ENV_VAR_NAME)\n _aclient: Optional[\"Mistral\"] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the `Mistral` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from mistralai import Mistral\n except ImportError as ie:\n raise ImportError(\n \"MistralAI Python client is not installed. 
Please install it using\"\n \" `pip install mistralai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._aclient = Mistral(\n api_key=self.api_key.get_secret_value(),\n endpoint=self.endpoint,\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout, # type: ignore\n max_concurrent_requests=self.max_concurrent_requests, # type: ignore\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"mistral\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n # TODO: add `num_generations` parameter once Mistral client allows `n` parameter\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_new_tokens: Optional[int] = None,\n temperature: Optional[float] = None,\n top_p: Optional[float] = None,\n ) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the MistralAI async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"mistral\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"max_tokens\": max_new_tokens,\n \"temperature\": temperature,\n \"top_p\": top_p,\n }\n generations = []\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n # TODO:\u00a0This should work just with the _aclient.chat method, but it's not working.\n # We need to check instructor and see if we can create a PR.\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n else:\n # completion = await self._aclient.chat(**kwargs) # type: ignore\n completion = await self._aclient.chat.complete_async(**kwargs) # type: ignore\n\n if structured_output:\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using MistralAI client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n\n return prepare_output(generations, **self._get_llm_statistics(completion))\n\n @staticmethod\n def _get_llm_statistics(completion: \"ChatCompletionResponse\") -> \"LLMStatistics\":\n return {\n \"input_tokens\": 
[completion.usage.prompt_tokens],\n \"output_tokens\": [completion.usage.completion_tokens],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.load","title":"load() ","text":"Loads the Mistral client to benefit from async requests. Source code in src/distilabel/models/llms/mistral.py def load(self) -> None:\n \"\"\"Loads the `Mistral` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from mistralai import Mistral\n except ImportError as ie:\n raise ImportError(\n \"MistralAI Python client is not installed. Please install it using\"\n \" `pip install mistralai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._aclient = Mistral(\n api_key=self.api_key.get_secret_value(),\n endpoint=self.endpoint,\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout, # type: ignore\n max_concurrent_requests=self.max_concurrent_requests, # type: ignore\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"mistral\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.agenerate","title":"agenerate(input, max_new_tokens=None, temperature=None, top_p=None) async ","text":"Generates num_generations responses for the given input using the MistralAI async client. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required max_new_tokens Optional[int] the maximum number of new tokens that the model will generate. Defaults to 128 . None temperature Optional[float] the temperature to use for the generation. Defaults to 0.1 . None top_p Optional[float] the top-p value to use for the generation. Defaults to 1.0 . None Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/mistral.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_new_tokens: Optional[int] = None,\n temperature: Optional[float] = None,\n top_p: Optional[float] = None,\n) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the MistralAI async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. 
Defaults to `1.0`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"mistral\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"max_tokens\": max_new_tokens,\n \"temperature\": temperature,\n \"top_p\": top_p,\n }\n generations = []\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n # TODO:\u00a0This should work just with the _aclient.chat method, but it's not working.\n # We need to check instructor and see if we can create a PR.\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n else:\n # completion = await self._aclient.chat(**kwargs) # type: ignore\n completion = await self._aclient.chat.complete_async(**kwargs) # type: ignore\n\n if structured_output:\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using MistralAI client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n\n return prepare_output(generations, **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM","title":"MixtureOfAgentsLLM ","text":" Bases: AsyncLLM Mixture-of-Agents implementation. An LLM class that leverages LLM s collective strenghts to generate a response, as described in the \"Mixture-of-Agents Enhances Large Language model Capabilities\" paper. There is a list of LLM s proposing/generating outputs that LLM s from the next round/layer can use as auxiliary information. Finally, there is an LLM that aggregates the outputs to generate the final response. Attributes: Name Type Description aggregator_llm LLM The LLM that aggregates the outputs of the proposer LLM s. proposers_llms List[AsyncLLM] The list of LLM s that propose outputs to be aggregated. rounds int The number of layers or rounds that the proposers_llms will generate outputs. Defaults to 1 . References - Mixture-of-Agents Enhances Large Language Model Capabilities
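To make the round structure concrete before the official examples, the following is a minimal, framework-agnostic sketch of the flow described above; it is not distilabel's implementation, and the `Proposer`/`Aggregator` callables are hypothetical placeholders for real LLM calls. Each round, every proposer answers the prompt with the previous round's answers injected as numbered references, and the aggregator then produces the final response from the last round's outputs.

```python
from typing import Callable, List

# Hypothetical stand-ins for real LLM calls; not part of distilabel.
Proposer = Callable[[str, str], str]    # (system_prompt, user_prompt) -> answer
Aggregator = Callable[[str, str], str]  # (system_prompt, user_prompt) -> answer


def build_reference_prompt(base_system_prompt: str, prev_outputs: List[str]) -> str:
    """Append the previous round's answers as numbered references."""
    prompt = base_system_prompt
    for i, output in enumerate(prev_outputs):
        prompt += f"\n{i + 1}. {output}"
    return prompt


def mixture_of_agents(
    user_prompt: str,
    proposers: List[Proposer],
    aggregator: Aggregator,
    base_system_prompt: str,
    rounds: int = 1,
) -> str:
    prev_outputs: List[str] = []
    for _ in range(rounds):
        system = build_reference_prompt(base_system_prompt, prev_outputs)
        # Every proposer answers with the previous answers as auxiliary information.
        prev_outputs = [propose(system, user_prompt) for propose in proposers]
    # The aggregator combines the last round's outputs into the final response.
    final_system = build_reference_prompt(base_system_prompt, prev_outputs)
    return aggregator(final_system, user_prompt)
```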
Examples: Generate text: from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\nllm = MixtureOfAgentsLLM(\n aggregator_llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n proposers_llms=[\n InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n InferenceEndpointsLLM(\n model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n ),\n InferenceEndpointsLLM(\n model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n ),\n ],\n rounds=2,\n)\n\nllm.load()\n\noutput = llm.generate_outputs(\n inputs=[\n [\n {\n \"role\": \"user\",\n \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n }\n ]\n ]\n)\n Source code in src/distilabel/models/llms/moa.py class MixtureOfAgentsLLM(AsyncLLM):\n \"\"\"`Mixture-of-Agents` implementation.\n\n An `LLM` class that leverages `LLM`s collective strenghts to generate a response,\n as described in the \"Mixture-of-Agents Enhances Large Language model Capabilities\"\n paper. There is a list of `LLM`s proposing/generating outputs that `LLM`s from the next\n round/layer can use as auxiliary information. Finally, there is an `LLM` that aggregates\n the outputs to generate the final response.\n\n Attributes:\n aggregator_llm: The `LLM` that aggregates the outputs of the proposer `LLM`s.\n proposers_llms: The list of `LLM`s that propose outputs to be aggregated.\n rounds: The number of layers or rounds that the `proposers_llms` will generate\n outputs. Defaults to `1`.\n\n References:\n - [Mixture-of-Agents Enhances Large Language Model Capabilities](https://arxiv.org/abs/2406.04692)\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\n llm = MixtureOfAgentsLLM(\n aggregator_llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n proposers_llms=[\n InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n InferenceEndpointsLLM(\n model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n ),\n InferenceEndpointsLLM(\n model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n ),\n ],\n rounds=2,\n )\n\n llm.load()\n\n output = llm.generate_outputs(\n inputs=[\n [\n {\n \"role\": \"user\",\n \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n }\n ]\n ]\n )\n ```\n \"\"\"\n\n aggregator_llm: LLM\n proposers_llms: List[AsyncLLM] = Field(default_factory=list)\n rounds: int = 1\n\n @property\n def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n \"\"\"Returns the runtime parameters of the `LLM`, which are a combination of the\n `RuntimeParameter`s of the `LLM`, the `aggregator_llm` and the `proposers_llms`.\n\n Returns:\n The runtime parameters of the `LLM`.\n \"\"\"\n runtime_parameters_names = super().runtime_parameters_names\n del runtime_parameters_names[\"generation_kwargs\"]\n return runtime_parameters_names\n\n def load(self) -> None:\n \"\"\"Loads all the `LLM`s in the `MixtureOfAgents`.\"\"\"\n 
super().load()\n\n for llm in self.proposers_llms:\n self._logger.debug(f\"Loading proposer LLM in MoA: {llm}\") # type: ignore\n llm.load()\n\n self._logger.debug(f\"Loading aggregator LLM in MoA: {self.aggregator_llm}\") # type: ignore\n self.aggregator_llm.load()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the aggregated model name.\"\"\"\n return f\"moa-{self.aggregator_llm.model_name}-{'-'.join([llm.model_name for llm in self.proposers_llms])}\"\n\n def get_generation_kwargs(self) -> Dict[str, Any]:\n \"\"\"Returns the generation kwargs of the `MixtureOfAgents` as a dictionary.\n\n Returns:\n The generation kwargs of the `MixtureOfAgents`.\n \"\"\"\n return {\n \"aggregator_llm\": self.aggregator_llm.get_generation_kwargs(),\n \"proposers_llms\": [\n llm.get_generation_kwargs() for llm in self.proposers_llms\n ],\n }\n\n # `abstractmethod`, had to be implemented but not used\n async def agenerate(\n self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n ) -> List[Union[str, None]]:\n raise NotImplementedError(\n \"`agenerate` method is not implemented for `MixtureOfAgents`\"\n )\n\n def _build_moa_system_prompt(self, prev_outputs: List[str]) -> str:\n \"\"\"Builds the Mixture-of-Agents system prompt.\n\n Args:\n prev_outputs: The list of previous outputs to use as references.\n\n Returns:\n The Mixture-of-Agents system prompt.\n \"\"\"\n moa_system_prompt = MOA_SYSTEM_PROMPT\n for i, prev_output in enumerate(prev_outputs):\n if prev_output is not None:\n moa_system_prompt += f\"\\n{i + 1}. {prev_output}\"\n return moa_system_prompt\n\n def _inject_moa_system_prompt(\n self, input: \"StandardInput\", prev_outputs: List[str]\n ) -> \"StandardInput\":\n \"\"\"Injects the Mixture-of-Agents system prompt into the input.\n\n Args:\n input: The input to inject the system prompt into.\n prev_outputs: The list of previous outputs to use as references.\n\n Returns:\n The input with the Mixture-of-Agents system prompt injected.\n \"\"\"\n if len(prev_outputs) == 0:\n return input\n\n moa_system_prompt = self._build_moa_system_prompt(prev_outputs)\n\n system = next((item for item in input if item[\"role\"] == \"system\"), None)\n if system:\n original_system_prompt = system[\"content\"]\n system[\"content\"] = f\"{moa_system_prompt}\\n\\n{original_system_prompt}\"\n else:\n input.insert(0, {\"role\": \"system\", \"content\": moa_system_prompt})\n\n return input\n\n async def _agenerate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n aggregator_llm_kwargs: Dict[str, Any] = kwargs.get(\"aggregator_llm\", {})\n proposers_llms_kwargs: List[Dict[str, Any]] = kwargs.get(\n \"proposers_llms\", [{}] * len(self.proposers_llms)\n )\n\n prev_outputs = []\n for round in range(self.rounds):\n self._logger.debug(f\"Generating round {round + 1}/{self.rounds} in MoA\") # type: ignore\n\n # Generate `num_generations` with each proposer LLM for each input\n tasks = [\n asyncio.create_task(\n llm._agenerate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in 
itertools.zip_longest(\n inputs, prev_outputs, fillvalue=[]\n )\n ],\n num_generations=1,\n **generation_kwargs,\n )\n )\n for llm, generation_kwargs in zip(\n self.proposers_llms, proposers_llms_kwargs\n )\n ]\n\n # Group generations per input\n outputs: List[List[\"GenerateOutput\"]] = await asyncio.gather(*tasks)\n prev_outputs = [\n list(itertools.chain(*input_outputs)) for input_outputs in zip(*outputs)\n ]\n\n self._logger.debug(\"Aggregating outputs in MoA\") # type: ignore\n if isinstance(self.aggregator_llm, AsyncLLM):\n return await self.aggregator_llm._agenerate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in zip(inputs, prev_outputs)\n ],\n num_generations=num_generations,\n **aggregator_llm_kwargs,\n )\n\n return self.aggregator_llm.generate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in zip(inputs, prev_outputs)\n ],\n num_generations=num_generations,\n **aggregator_llm_kwargs,\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property ","text":"Returns the runtime parameters of the LLM , which are a combination of the RuntimeParameter s of the LLM , the aggregator_llm and the proposers_llms . Returns: Type Description RuntimeParametersNames The runtime parameters of the LLM . "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.model_name","title":"model_name: str property ","text":"Returns the aggregated model name. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.load","title":"load() ","text":"Loads all the LLM s in the MixtureOfAgents . Source code in src/distilabel/models/llms/moa.py def load(self) -> None:\n \"\"\"Loads all the `LLM`s in the `MixtureOfAgents`.\"\"\"\n super().load()\n\n for llm in self.proposers_llms:\n self._logger.debug(f\"Loading proposer LLM in MoA: {llm}\") # type: ignore\n llm.load()\n\n self._logger.debug(f\"Loading aggregator LLM in MoA: {self.aggregator_llm}\") # type: ignore\n self.aggregator_llm.load()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.get_generation_kwargs","title":"get_generation_kwargs() ","text":"Returns the generation kwargs of the MixtureOfAgents as a dictionary. Returns: Type Description Dict[str, Any] The generation kwargs of the MixtureOfAgents . Source code in src/distilabel/models/llms/moa.py def get_generation_kwargs(self) -> Dict[str, Any]:\n \"\"\"Returns the generation kwargs of the `MixtureOfAgents` as a dictionary.\n\n Returns:\n The generation kwargs of the `MixtureOfAgents`.\n \"\"\"\n return {\n \"aggregator_llm\": self.aggregator_llm.get_generation_kwargs(),\n \"proposers_llms\": [\n llm.get_generation_kwargs() for llm in self.proposers_llms\n ],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._build_moa_system_prompt","title":"_build_moa_system_prompt(prev_outputs) ","text":"Builds the Mixture-of-Agents system prompt. Parameters: Name Type Description Default prev_outputs List[str] The list of previous outputs to use as references. required Returns: Type Description str The Mixture-of-Agents system prompt. 
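As an illustration of the returned value, the snippet below shows what the built prompt looks like for two previous outputs. `MOA_SYSTEM_PROMPT` is a module-level constant in `moa.py` whose exact wording is not reproduced on this page, so a placeholder string is used here.

```python
# Illustrative only: placeholder for the real constant defined in
# `distilabel/models/llms/moa.py`.
MOA_SYSTEM_PROMPT = "<aggregation instructions>"

prev_outputs = [
    "Paris is the capital of France.",
    "The capital of France is Paris.",
]

# `_build_moa_system_prompt(prev_outputs)` appends each non-`None` previous
# output as a numbered reference line, producing:
expected = (
    MOA_SYSTEM_PROMPT
    + "\n1. Paris is the capital of France."
    + "\n2. The capital of France is Paris."
)
```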
Source code in src/distilabel/models/llms/moa.py def _build_moa_system_prompt(self, prev_outputs: List[str]) -> str:\n \"\"\"Builds the Mixture-of-Agents system prompt.\n\n Args:\n prev_outputs: The list of previous outputs to use as references.\n\n Returns:\n The Mixture-of-Agents system prompt.\n \"\"\"\n moa_system_prompt = MOA_SYSTEM_PROMPT\n for i, prev_output in enumerate(prev_outputs):\n if prev_output is not None:\n moa_system_prompt += f\"\\n{i + 1}. {prev_output}\"\n return moa_system_prompt\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._inject_moa_system_prompt","title":"_inject_moa_system_prompt(input, prev_outputs) ","text":"Injects the Mixture-of-Agents system prompt into the input. Parameters: Name Type Description Default input StandardInput The input to inject the system prompt into. required prev_outputs List[str] The list of previous outputs to use as references. required Returns: Type Description StandardInput The input with the Mixture-of-Agents system prompt injected. Source code in src/distilabel/models/llms/moa.py def _inject_moa_system_prompt(\n self, input: \"StandardInput\", prev_outputs: List[str]\n) -> \"StandardInput\":\n \"\"\"Injects the Mixture-of-Agents system prompt into the input.\n\n Args:\n input: The input to inject the system prompt into.\n prev_outputs: The list of previous outputs to use as references.\n\n Returns:\n The input with the Mixture-of-Agents system prompt injected.\n \"\"\"\n if len(prev_outputs) == 0:\n return input\n\n moa_system_prompt = self._build_moa_system_prompt(prev_outputs)\n\n system = next((item for item in input if item[\"role\"] == \"system\"), None)\n if system:\n original_system_prompt = system[\"content\"]\n system[\"content\"] = f\"{moa_system_prompt}\\n\\n{original_system_prompt}\"\n else:\n input.insert(0, {\"role\": \"system\", \"content\": moa_system_prompt})\n\n return input\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._agenerate","title":"_agenerate(inputs, num_generations=1, **kwargs) async ","text":"Internal function to concurrently generate responses for a list of inputs. Parameters: Name Type Description Default inputs List[FormattedInput] the list of inputs to generate responses for. required num_generations int the number of generations to generate per input. 1 **kwargs Any the additional kwargs to be used for the generation. {} Returns: Type Description List[GenerateOutput] A list containing the generations for each input. 
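Before the source listing, a short note on the shape of the `**kwargs` this method expects, based on how it reads them: one dict of generation kwargs for the aggregator and one dict per proposer LLM. The concrete keys (e.g. `temperature`) depend on each underlying LLM and are shown here only as an illustration.

```python
# Illustrative shape of the generation kwargs consumed by `_agenerate`
# (and mirrored by `get_generation_kwargs`). The inner keys are examples,
# not an exhaustive or guaranteed set.
moa_generation_kwargs = {
    "aggregator_llm": {"temperature": 0.7},
    "proposers_llms": [
        {"temperature": 0.7},
        {"temperature": 0.7},
        {"temperature": 0.7},
    ],
}
```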
Source code in src/distilabel/models/llms/moa.py async def _agenerate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n aggregator_llm_kwargs: Dict[str, Any] = kwargs.get(\"aggregator_llm\", {})\n proposers_llms_kwargs: List[Dict[str, Any]] = kwargs.get(\n \"proposers_llms\", [{}] * len(self.proposers_llms)\n )\n\n prev_outputs = []\n for round in range(self.rounds):\n self._logger.debug(f\"Generating round {round + 1}/{self.rounds} in MoA\") # type: ignore\n\n # Generate `num_generations` with each proposer LLM for each input\n tasks = [\n asyncio.create_task(\n llm._agenerate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in itertools.zip_longest(\n inputs, prev_outputs, fillvalue=[]\n )\n ],\n num_generations=1,\n **generation_kwargs,\n )\n )\n for llm, generation_kwargs in zip(\n self.proposers_llms, proposers_llms_kwargs\n )\n ]\n\n # Group generations per input\n outputs: List[List[\"GenerateOutput\"]] = await asyncio.gather(*tasks)\n prev_outputs = [\n list(itertools.chain(*input_outputs)) for input_outputs in zip(*outputs)\n ]\n\n self._logger.debug(\"Aggregating outputs in MoA\") # type: ignore\n if isinstance(self.aggregator_llm, AsyncLLM):\n return await self.aggregator_llm._agenerate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in zip(inputs, prev_outputs)\n ],\n num_generations=num_generations,\n **aggregator_llm_kwargs,\n )\n\n return self.aggregator_llm.generate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in zip(inputs, prev_outputs)\n ],\n num_generations=num_generations,\n **aggregator_llm_kwargs,\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM","title":"OllamaLLM ","text":" Bases: AsyncLLM Ollama LLM implementation running the Async API client. Attributes: Name Type Description model str the model name to use for the LLM e.g. \"notus\". host Optional[RuntimeParameter[str]] the Ollama server host. timeout RuntimeParameter[int] the timeout for the LLM. Defaults to 120 . _aclient Optional[AsyncClient] the AsyncClient to use for the Ollama API. It is meant to be used internally. Set in the load method. Runtime parameters host : the Ollama server host. timeout : the client timeout for the Ollama API. Defaults to 120 . Examples: Generate text: from distilabel.models.llms import OllamaLLM\n\nllm = OllamaLLM(model=\"llama3\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Source code in src/distilabel/models/llms/ollama.py class OllamaLLM(AsyncLLM):\n \"\"\"Ollama LLM implementation running the Async API client.\n\n Attributes:\n model: the model name to use for the LLM e.g. \"notus\".\n host: the Ollama server host.\n timeout: the timeout for the LLM. Defaults to `120`.\n _aclient: the `AsyncClient` to use for the Ollama API. 
It is meant to be used internally.\n Set in the `load` method.\n\n Runtime parameters:\n - `host`: the Ollama server host.\n - `timeout`: the client timeout for the Ollama API. Defaults to `120`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import OllamaLLM\n\n llm = OllamaLLM(model=\"llama3\")\n\n llm.load()\n\n # Call the model\n output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n \"\"\"\n\n model: str\n host: Optional[RuntimeParameter[str]] = Field(\n default=None, description=\"The host of the Ollama API.\"\n )\n timeout: RuntimeParameter[int] = Field(\n default=120, description=\"The timeout for the Ollama API.\"\n )\n follow_redirects: bool = True\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _num_generations_param_supported = False\n\n _aclient: Optional[\"AsyncClient\"] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the `AsyncClient` to use Ollama async API.\"\"\"\n super().load()\n\n try:\n from ollama import AsyncClient\n\n self._aclient = AsyncClient(\n host=self.host,\n timeout=self.timeout,\n follow_redirects=self.follow_redirects,\n )\n except ImportError as e:\n raise ImportError(\n \"Ollama Python client is not installed. Please install it using\"\n \" `pip install ollama`.\"\n ) from e\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: StandardInput,\n format: Literal[\"\", \"json\"] = \"\",\n # TODO: include relevant options from `Options` in `agenerate` method.\n options: Union[Options, None] = None,\n keep_alive: Union[bool, None] = None,\n ) -> GenerateOutput:\n \"\"\"\n Generates a response asynchronously, using the [Ollama Async API definition](https://github.com/ollama/ollama-python).\n\n Args:\n input: the input to use for the generation.\n format: the format to use for the generation. Defaults to `\"\"`.\n options: the options to use for the generation. Defaults to `None`.\n keep_alive: whether to keep the connection alive. Defaults to `None`.\n\n Returns:\n A list of strings as completion for the given input.\n \"\"\"\n text = None\n try:\n completion: Dict[str, Any] = await self._aclient.chat( # type: ignore\n model=self.model,\n messages=input, # type: ignore\n stream=False,\n format=format,\n options=options,\n keep_alive=keep_alive,\n )\n text = completion[\"message\"][\"content\"]\n except Exception as e:\n self._logger.warning( # type: ignore\n f\"\u26a0\ufe0f Received no response using Ollama client (model: '{self.model_name}').\"\n f\" Finish reason was: {e}\"\n )\n\n return prepare_output([text], **self._get_llm_statistics(completion))\n\n @staticmethod\n def _get_llm_statistics(completion: Dict[str, Any]) -> \"LLMStatistics\":\n return {\n \"input_tokens\": [completion[\"prompt_eval_count\"]],\n \"output_tokens\": [completion[\"eval_count\"]],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.load","title":"load() ","text":"Loads the AsyncClient to use Ollama async API. 
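Before the source listing, a minimal sketch of how the `host` and `timeout` runtime parameters set on `OllamaLLM` end up in the `ollama.AsyncClient` created by `load()`. The URL below assumes a local Ollama server on its default port and is only an illustration; adjust it for your setup.

```python
from distilabel.models.llms import OllamaLLM

# Sketch: `host` and `timeout` are runtime parameters that `load()` forwards
# to `ollama.AsyncClient` (together with `follow_redirects`).
llm = OllamaLLM(
    model="llama3",
    host="http://localhost:11434",  # assumed default local Ollama endpoint
    timeout=120,
)

llm.load()  # creates the `AsyncClient(host=..., timeout=..., follow_redirects=True)`

# Call the model, as in the example above.
output = llm.generate(inputs=[[{"role": "user", "content": "Hello world!"}]])
```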
Source code in src/distilabel/models/llms/ollama.py def load(self) -> None:\n \"\"\"Loads the `AsyncClient` to use Ollama async API.\"\"\"\n super().load()\n\n try:\n from ollama import AsyncClient\n\n self._aclient = AsyncClient(\n host=self.host,\n timeout=self.timeout,\n follow_redirects=self.follow_redirects,\n )\n except ImportError as e:\n raise ImportError(\n \"Ollama Python client is not installed. Please install it using\"\n \" `pip install ollama`.\"\n ) from e\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.agenerate","title":"agenerate(input, format='', options=None, keep_alive=None) async ","text":"Generates a response asynchronously, using the Ollama Async API definition. Parameters: Name Type Description Default input StandardInput the input to use for the generation. required format Literal['', 'json'] the format to use for the generation. Defaults to \"\" . '' options Union[Options, None] the options to use for the generation. Defaults to None . None keep_alive Union[bool, None] whether to keep the connection alive. Defaults to None . None Returns: Type Description GenerateOutput A list of strings as completion for the given input. Source code in src/distilabel/models/llms/ollama.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: StandardInput,\n format: Literal[\"\", \"json\"] = \"\",\n # TODO: include relevant options from `Options` in `agenerate` method.\n options: Union[Options, None] = None,\n keep_alive: Union[bool, None] = None,\n) -> GenerateOutput:\n \"\"\"\n Generates a response asynchronously, using the [Ollama Async API definition](https://github.com/ollama/ollama-python).\n\n Args:\n input: the input to use for the generation.\n format: the format to use for the generation. Defaults to `\"\"`.\n options: the options to use for the generation. Defaults to `None`.\n keep_alive: whether to keep the connection alive. Defaults to `None`.\n\n Returns:\n A list of strings as completion for the given input.\n \"\"\"\n text = None\n try:\n completion: Dict[str, Any] = await self._aclient.chat( # type: ignore\n model=self.model,\n messages=input, # type: ignore\n stream=False,\n format=format,\n options=options,\n keep_alive=keep_alive,\n )\n text = completion[\"message\"][\"content\"]\n except Exception as e:\n self._logger.warning( # type: ignore\n f\"\u26a0\ufe0f Received no response using Ollama client (model: '{self.model_name}').\"\n f\" Finish reason was: {e}\"\n )\n\n return prepare_output([text], **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM","title":"OpenAILLM ","text":" Bases: AsyncLLM OpenAI LLM implementation running the async API client. Attributes: Name Type Description model str the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc. Supported models can be found here. base_url Optional[RuntimeParameter[str]] the base URL to use for the OpenAI API requests. Defaults to None , which means that the value set for the environment variable OPENAI_BASE_URL will be used, or \"https://api.openai.com/v1\" if not set. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_KEY will be used, or None if not set. max_retries RuntimeParameter[int] the maximum number of times to retry the request to the API before failing. Defaults to 6 . 
timeout RuntimeParameter[int] the maximum time in seconds to wait for a response from the API. Defaults to 120 . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . Runtime parameters base_url : the base URL to use for the OpenAI API requests. Defaults to None . api_key : the API key to authenticate the requests to the OpenAI API. Defaults to None . max_retries : the maximum number of times to retry the request to the API before failing. Defaults to 6 . timeout : the maximum time in seconds to wait for a response from the API. Defaults to 120 . Icon :simple-openai: Examples: Generate text: from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate text from a custom endpoint following the OpenAI API: from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pydantic import BaseModel\nfrom distilabel.models.llms import OpenAILLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = OpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Generate with Batch API (offline batch generation): from distilabel.models.llms import OpenAILLM\n\nload = llm = OpenAILLM(\n model=\"gpt-3.5-turbo\",\n use_offline_batch_generation=True,\n offline_batch_generation_block_until_done=5, # poll for results every 5 seconds\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n# [['Hello! How can I assist you today?']]\n Source code in src/distilabel/models/llms/openai.py class OpenAILLM(AsyncLLM):\n \"\"\"OpenAI LLM implementation running the async API client.\n\n Attributes:\n model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc.\n Supported models can be found [here](https://platform.openai.com/docs/guides/text-generation).\n base_url: the base URL to use for the OpenAI API requests. Defaults to `None`, which\n means that the value set for the environment variable `OPENAI_BASE_URL` will\n be used, or \"https://api.openai.com/v1\" if not set.\n api_key: the API key to authenticate the requests to the OpenAI API. Defaults to\n `None` which means that the value set for the environment variable `OPENAI_API_KEY`\n will be used, or `None` if not set.\n max_retries: the maximum number of times to retry the request to the API before\n failing. Defaults to `6`.\n timeout: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. 
You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n\n Runtime parameters:\n - `base_url`: the base URL to use for the OpenAI API requests. Defaults to `None`.\n - `api_key`: the API key to authenticate the requests to the OpenAI API. Defaults\n to `None`.\n - `max_retries`: the maximum number of times to retry the request to the API before\n failing. Defaults to `6`.\n - `timeout`: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n\n Icon:\n `:simple-openai:`\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import OpenAILLM\n\n llm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate text from a custom endpoint following the OpenAI API:\n\n ```python\n from distilabel.models.llms import OpenAILLM\n\n llm = OpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import OpenAILLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = OpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n\n Generate with Batch API (offline batch generation):\n\n ```python\n from distilabel.models.llms import OpenAILLM\n\n load = llm = OpenAILLM(\n model=\"gpt-3.5-turbo\",\n use_offline_batch_generation=True,\n offline_batch_generation_block_until_done=5, # poll for results every 5 seconds\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n # [['Hello! How can I assist you today?']]\n ```\n \"\"\"\n\n model: str\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n \"OPENAI_BASE_URL\", \"https://api.openai.com/v1\"\n ),\n description=\"The base URL to use for the OpenAI API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_OPENAI_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the OpenAI API.\",\n )\n max_retries: RuntimeParameter[int] = Field(\n default=6,\n description=\"The maximum number of times to retry the request to the API before\"\n \" failing.\",\n )\n timeout: RuntimeParameter[int] = Field(\n default=120,\n description=\"The maximum time in seconds to wait for a response from the API.\",\n )\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _api_key_env_var: str = PrivateAttr(_OPENAI_API_KEY_ENV_VAR_NAME)\n _client: \"OpenAI\" = PrivateAttr(None)\n _aclient: \"AsyncOpenAI\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the `AsyncOpenAI` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from openai import AsyncOpenAI, OpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. 
Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._client = OpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n self._aclient = AsyncOpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"openai\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n def unload(self) -> None:\n \"\"\"Set clients to `None` as they both contain `thread._RLock` which cannot be pickled\n in case an exception is raised and has to be handled in the main process\"\"\"\n\n self._client = None # type: ignore\n self._aclient = None # type: ignore\n self.structured_output = None\n super().unload()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n logprobs: bool = False,\n top_logprobs: Optional[PositiveInt] = None,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[Union[str, List[str]]] = None,\n response_format: Optional[Dict[str, str]] = None,\n ) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the OpenAI async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n logprobs: whether to return the log probabilities or not. Defaults to `False`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. Defaults to `None`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: a string or a list of strings to use as a stop sequence for the generation.\n Defaults to `None`.\n response_format: the format of the response to return. Must be one of\n \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n for more information on how to use the JSON model from OpenAI. Defaults to None\n which returns text. 
To return JSON, use {\"type\": \"json_object\"}.\n\n Note:\n If response_format\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output, # type: ignore\n client=self._aclient,\n framework=\"openai\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"logprobs\": logprobs,\n \"top_logprobs\": top_logprobs,\n \"max_tokens\": max_new_tokens,\n \"n\": num_generations,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stop\": stop,\n }\n # Check if it's a vision generation task, in that case \"stop\" cannot be used or raises\n # an error in the API.\n if isinstance(\n [row for row in input if row[\"role\"] == \"user\"][0][\"content\"], list\n ):\n kwargs.pop(\"stop\")\n\n if response_format is not None:\n kwargs[\"response_format\"] = response_format\n\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore\n\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n\n if structured_output:\n # NOTE: `instructor` doesn't work with `n` parameter, so it will always return\n # only 1 choice.\n statistics = self._get_llm_statistics(completion._raw_response)\n if choice_logprobs := self._get_logprobs_from_choice(\n completion._raw_response.choices[0]\n ):\n output_logprobs = [choice_logprobs]\n else:\n output_logprobs = None\n return prepare_output(\n generations=[completion.model_dump_json()],\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=output_logprobs,\n )\n\n return self._generations_from_openai_completion(completion)\n\n def _generations_from_openai_completion(\n self, completion: \"OpenAIChatCompletion\"\n ) -> \"GenerateOutput\":\n \"\"\"Get the generations from the OpenAI Chat Completion object.\n\n Args:\n completion: the completion object to get the generations from.\n\n Returns:\n A list of strings containing the generated responses for the input.\n \"\"\"\n generations = []\n logprobs = []\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using OpenAI client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n if choice_logprobs := self._get_logprobs_from_choice(choice):\n logprobs.append(choice_logprobs)\n\n statistics = self._get_llm_statistics(completion)\n return prepare_output(\n generations=generations,\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=logprobs,\n )\n\n def _get_logprobs_from_choice(\n self, choice: \"OpenAIChoice\"\n ) -> Union[List[List[\"Logprob\"]], None]:\n if choice.logprobs is None or choice.logprobs.content is None:\n return None\n\n return [\n [\n {\"token\": top_logprob.token, \"logprob\": top_logprob.logprob}\n for top_logprob in token_logprobs.top_logprobs\n ]\n for token_logprobs in choice.logprobs.content\n ]\n\n def offline_batch_generate(\n self,\n inputs: Union[List[\"FormattedInput\"], None] = 
None,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n logprobs: bool = False,\n top_logprobs: Optional[PositiveInt] = None,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[Union[str, List[str]]] = None,\n response_format: Optional[str] = None,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Uses the OpenAI batch API to generate `num_generations` responses for the given\n inputs.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n logprobs: whether to return the log probabilities or not. Defaults to `False`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. Defaults to `None`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: a string or a list of strings to use as a stop sequence for the generation.\n Defaults to `None`.\n response_format: the format of the response to return. Must be one of\n \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n for more information on how to use the JSON model from OpenAI. Defaults to `text`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input\n in `inputs`.\n\n Raises:\n DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n is not finished yet.\n ValueError: if no job IDs were found to retrieve the results from.\n \"\"\"\n if self.jobs_ids:\n return self._check_and_get_batch_results()\n\n if inputs:\n self.jobs_ids = self._create_jobs(\n inputs=inputs,\n **{\n \"model\": self.model,\n \"logprobs\": logprobs,\n \"top_logprobs\": top_logprobs,\n \"max_tokens\": max_new_tokens,\n \"n\": num_generations,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stop\": stop,\n \"response_format\": response_format,\n },\n )\n raise DistilabelOfflineBatchGenerationNotFinishedException(\n jobs_ids=self.jobs_ids\n )\n\n raise ValueError(\"No `inputs` were provided and no `jobs_ids` were found.\")\n\n def _check_and_get_batch_results(self) -> List[\"GenerateOutput\"]:\n \"\"\"Checks the status of the batch jobs and retrieves the results from the OpenAI\n Batch API.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n\n Raises:\n ValueError: if no job IDs were found to retrieve the results from.\n DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n is not finished yet.\n RuntimeError: if the only batch job found failed.\n \"\"\"\n if not self.jobs_ids:\n raise ValueError(\"No job IDs were found to retrieve the results from.\")\n\n outputs = []\n for batch_id in self.jobs_ids:\n batch = self._get_openai_batch(batch_id)\n\n if batch.status in (\"validating\", \"in_progress\", \"finalizing\"):\n raise DistilabelOfflineBatchGenerationNotFinishedException(\n jobs_ids=self.jobs_ids\n )\n\n if batch.status in (\"failed\", \"expired\", \"cancelled\", 
\"cancelling\"):\n self._logger.error( # type: ignore\n f\"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'.\"\n )\n if len(self.jobs_ids) == 1:\n self.jobs_ids = None\n raise RuntimeError(\n f\"The only OpenAI API Batch that was created with ID '{batch_id}'\"\n f\" failed with status '{batch.status}'.\"\n )\n\n continue\n\n outputs.extend(self._retrieve_batch_results(batch))\n\n # sort by `custom_id` to return the results in the same order as the inputs\n outputs = sorted(outputs, key=lambda x: int(x[\"custom_id\"]))\n return [self._parse_output(output) for output in outputs]\n\n def _parse_output(self, output: Dict[str, Any]) -> \"GenerateOutput\":\n \"\"\"Parses the output from the OpenAI Batch API into a list of strings.\n\n Args:\n output: the output to parse.\n\n Returns:\n A list of strings containing the generated responses for the input.\n \"\"\"\n from openai.types.chat import ChatCompletion as OpenAIChatCompletion\n\n if \"response\" not in output:\n return []\n\n if output[\"response\"][\"status_code\"] != 200:\n return []\n\n return self._generations_from_openai_completion(\n OpenAIChatCompletion(**output[\"response\"][\"body\"])\n )\n\n def _get_openai_batch(self, batch_id: str) -> \"OpenAIBatch\":\n \"\"\"Gets a batch from the OpenAI Batch API.\n\n Args:\n batch_id: the ID of the batch to retrieve.\n\n Returns:\n The batch retrieved from the OpenAI Batch API.\n\n Raises:\n openai.OpenAIError: if there was an error while retrieving the batch from the\n OpenAI Batch API.\n \"\"\"\n import openai\n\n try:\n return self._client.batches.retrieve(batch_id)\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while retrieving batch '{batch_id}' from OpenAI: {e}\"\n )\n raise e\n\n def _retrieve_batch_results(self, batch: \"OpenAIBatch\") -> List[Dict[str, Any]]:\n \"\"\"Retrieves the results of a batch from its output file, parsing the JSONL content\n into a list of dictionaries.\n\n Args:\n batch: the batch to retrieve the results from.\n\n Returns:\n A list of dictionaries containing the results of the batch.\n\n Raises:\n AssertionError: if no output file ID was found in the batch.\n \"\"\"\n import openai\n\n assert batch.output_file_id, \"No output file ID was found in the batch.\"\n\n try:\n file_response = self._client.files.content(batch.output_file_id)\n return [orjson.loads(line) for line in file_response.text.splitlines()]\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while retrieving batch results from file '{batch.output_file_id}': {e}\"\n )\n return []\n\n def _create_jobs(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n ) -> Tuple[str, ...]:\n \"\"\"Creates jobs in the OpenAI Batch API to generate responses for the given inputs.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n A list of job IDs created in the OpenAI Batch API.\n \"\"\"\n batch_input_files = self._create_batch_files(inputs=inputs, **kwargs)\n jobs = []\n for batch_input_file in batch_input_files:\n if batch := self._create_batch_api_job(batch_input_file):\n jobs.append(batch.id)\n return tuple(jobs)\n\n def _create_batch_api_job(\n self, batch_input_file: \"OpenAIFileObject\"\n ) -> Union[\"OpenAIBatch\", None]:\n \"\"\"Creates a job in the OpenAI Batch API to generate responses for the given input\n file.\n\n Args:\n batch_input_file: the input file to generate responses for.\n\n Returns:\n The batch 
job created in the OpenAI Batch API.\n \"\"\"\n import openai\n\n metadata = {\"description\": \"distilabel\"}\n\n if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME:\n metadata[\"distilabel_pipeline_name\"] = distilabel_pipeline_name\n\n if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID:\n metadata[\"distilabel_pipeline_cache_id\"] = distilabel_pipeline_cache_id\n\n batch = None\n try:\n batch = self._client.batches.create(\n completion_window=\"24h\",\n endpoint=\"/v1/chat/completions\",\n input_file_id=batch_input_file.id,\n metadata=metadata,\n )\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while creating OpenAI Batch API job for file with ID\"\n f\" '{batch_input_file.id}': {e}.\"\n )\n raise e\n return batch\n\n def _create_batch_files(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n ) -> List[\"OpenAIFileObject\"]:\n \"\"\"Creates the necessary input files for the batch API to generate responses. The\n maximum size of each file so the OpenAI Batch API can process it is 100MB, so we\n need to split the inputs into multiple files if necessary.\n\n More information: https://platform.openai.com/docs/api-reference/files/create\n\n Args:\n inputs: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n The list of file objects created for the OpenAI Batch API.\n\n Raises:\n openai.OpenAIError: if there was an error while creating the batch input file\n in the OpenAI Batch API.\n \"\"\"\n import openai\n\n files = []\n for file_no, buffer in enumerate(\n self._create_jsonl_buffers(inputs=inputs, **kwargs)\n ):\n try:\n # TODO: add distilabel pipeline name and id\n batch_input_file = self._client.files.create(\n file=(self._name_for_openai_files(file_no), buffer),\n purpose=\"batch\",\n )\n files.append(batch_input_file)\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while creating OpenAI batch input file: {e}\"\n )\n raise e\n return files\n\n def _create_jsonl_buffers(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n ) -> Generator[io.BytesIO, None, None]:\n \"\"\"Creates a generator of buffers containing the JSONL formatted inputs to be\n used by the OpenAI Batch API. 
The buffers created are of size 100MB or less.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n kwargs: the keyword arguments to use for the generation.\n\n Yields:\n A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch\n API.\n \"\"\"\n buffer = io.BytesIO()\n buffer_current_size = 0\n for i, input in enumerate(inputs):\n # We create the smallest `custom_id` so we don't increase the size of the file\n # to much, but we can still sort the results with the order of the inputs.\n row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs)\n row_size = len(row)\n if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE:\n buffer.seek(0)\n yield buffer\n buffer = io.BytesIO()\n buffer_current_size = 0\n buffer.write(row)\n buffer_current_size += row_size\n\n if buffer_current_size > 0:\n buffer.seek(0)\n yield buffer\n\n def _create_jsonl_row(\n self, input: \"FormattedInput\", custom_id: str, **kwargs: Any\n ) -> bytes:\n \"\"\"Creates a JSONL formatted row to be used by the OpenAI Batch API.\n\n Args:\n input: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n custom_id: a custom ID to use for the row.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n A JSONL formatted row to be used by the OpenAI Batch API.\n \"\"\"\n # TODO: depending on the format of the input, add `response_format` to the kwargs\n row = {\n \"custom_id\": custom_id,\n \"method\": \"POST\",\n \"url\": \"/v1/chat/completions\",\n \"body\": {\"messages\": input, **kwargs},\n }\n json_row = orjson.dumps(row)\n return json_row + b\"\\n\"\n\n def _name_for_openai_files(self, file_no: int) -> str:\n if (\n envs.DISTILABEL_PIPELINE_NAME is None\n or envs.DISTILABEL_PIPELINE_CACHE_ID is None\n ):\n return f\"distilabel-pipeline-fileno-{file_no}.jsonl\"\n\n return f\"distilabel-pipeline-{envs.DISTILABEL_PIPELINE_NAME}-{envs.DISTILABEL_PIPELINE_CACHE_ID}-fileno-{file_no}.jsonl\"\n\n @staticmethod\n def _get_llm_statistics(\n completion: Union[\"OpenAIChatCompletion\", \"OpenAICompletion\"],\n ) -> \"LLMStatistics\":\n return {\n \"output_tokens\": [\n completion.usage.completion_tokens if completion.usage else 0\n ],\n \"input_tokens\": [completion.usage.prompt_tokens if completion.usage else 0],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.load","title":"load() ","text":"Loads the AsyncOpenAI client to benefit from async requests. Source code in src/distilabel/models/llms/openai.py def load(self) -> None:\n \"\"\"Loads the `AsyncOpenAI` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from openai import AsyncOpenAI, OpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. 
Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._client = OpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n self._aclient = AsyncOpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"openai\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.unload","title":"unload() ","text":"Set clients to None as they both contain thread._RLock which cannot be pickled in case an exception is raised and has to be handled in the main process Source code in src/distilabel/models/llms/openai.py def unload(self) -> None:\n \"\"\"Set clients to `None` as they both contain `thread._RLock` which cannot be pickled\n in case an exception is raised and has to be handled in the main process\"\"\"\n\n self._client = None # type: ignore\n self._aclient = None # type: ignore\n self.structured_output = None\n super().unload()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.agenerate","title":"agenerate(input, num_generations=1, max_new_tokens=128, logprobs=False, top_logprobs=None, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, stop=None, response_format=None) async ","text":"Generates num_generations responses for the given input using the OpenAI async client. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 logprobs bool whether to return the log probabilities or not. Defaults to False . False top_logprobs Optional[PositiveInt] the number of top log probabilities to return per output token generated. Defaults to None . None frequency_penalty float the repetition penalty to use for the generation. Defaults to 0.0 . 0.0 presence_penalty float the presence penalty to use for the generation. Defaults to 0.0 . 0.0 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 stop Optional[Union[str, List[str]]] a string or a list of strings to use as a stop sequence for the generation. Defaults to None . None response_format Optional[Dict[str, str]] the format of the response to return. Must be one of \"text\" or \"json\". Read the documentation here for more information on how to use the JSON model from OpenAI. Defaults to None which returns text. To return JSON, use {\"type\": \"json_object\"}. None Note If response_format Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. 
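Since `agenerate` is an async method, it can also be awaited directly once the client has been loaded. The snippet below is a minimal sketch rather than part of the official docs: it assumes `OPENAI_API_KEY` is set in the environment and uses `asyncio.run` to drive the call outside of a pipeline.

```python
# Minimal sketch (assumes OPENAI_API_KEY is set in the environment).
import asyncio

from distilabel.models.llms import OpenAILLM

llm = OpenAILLM(model="gpt-4-turbo")
llm.load()

output = asyncio.run(
    llm.agenerate(
        # JSON mode requires the prompt to mention JSON explicitly.
        input=[{"role": "user", "content": "Reply with a JSON object containing a `greeting` key."}],
        max_new_tokens=64,
        response_format={"type": "json_object"},
    )
)
print(output)
```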
Source code in src/distilabel/models/llms/openai.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n logprobs: bool = False,\n top_logprobs: Optional[PositiveInt] = None,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[Union[str, List[str]]] = None,\n response_format: Optional[Dict[str, str]] = None,\n) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the OpenAI async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n logprobs: whether to return the log probabilities or not. Defaults to `False`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. Defaults to `None`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: a string or a list of strings to use as a stop sequence for the generation.\n Defaults to `None`.\n response_format: the format of the response to return. Must be one of\n \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n for more information on how to use the JSON model from OpenAI. Defaults to None\n which returns text. 
To return JSON, use {\"type\": \"json_object\"}.\n\n Note:\n If response_format\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output, # type: ignore\n client=self._aclient,\n framework=\"openai\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"logprobs\": logprobs,\n \"top_logprobs\": top_logprobs,\n \"max_tokens\": max_new_tokens,\n \"n\": num_generations,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stop\": stop,\n }\n # Check if it's a vision generation task, in that case \"stop\" cannot be used or raises\n # an error in the API.\n if isinstance(\n [row for row in input if row[\"role\"] == \"user\"][0][\"content\"], list\n ):\n kwargs.pop(\"stop\")\n\n if response_format is not None:\n kwargs[\"response_format\"] = response_format\n\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore\n\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n\n if structured_output:\n # NOTE: `instructor` doesn't work with `n` parameter, so it will always return\n # only 1 choice.\n statistics = self._get_llm_statistics(completion._raw_response)\n if choice_logprobs := self._get_logprobs_from_choice(\n completion._raw_response.choices[0]\n ):\n output_logprobs = [choice_logprobs]\n else:\n output_logprobs = None\n return prepare_output(\n generations=[completion.model_dump_json()],\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=output_logprobs,\n )\n\n return self._generations_from_openai_completion(completion)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._generations_from_openai_completion","title":"_generations_from_openai_completion(completion) ","text":"Get the generations from the OpenAI Chat Completion object. Parameters: Name Type Description Default completion ChatCompletion the completion object to get the generations from. required Returns: Type Description GenerateOutput A list of strings containing the generated responses for the input. 
Source code in src/distilabel/models/llms/openai.py def _generations_from_openai_completion(\n self, completion: \"OpenAIChatCompletion\"\n) -> \"GenerateOutput\":\n \"\"\"Get the generations from the OpenAI Chat Completion object.\n\n Args:\n completion: the completion object to get the generations from.\n\n Returns:\n A list of strings containing the generated responses for the input.\n \"\"\"\n generations = []\n logprobs = []\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using OpenAI client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n if choice_logprobs := self._get_logprobs_from_choice(choice):\n logprobs.append(choice_logprobs)\n\n statistics = self._get_llm_statistics(completion)\n return prepare_output(\n generations=generations,\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=logprobs,\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.offline_batch_generate","title":"offline_batch_generate(inputs=None, num_generations=1, max_new_tokens=128, logprobs=False, top_logprobs=None, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, stop=None, response_format=None, **kwargs) ","text":"Uses the OpenAI batch API to generate num_generations responses for the given inputs. Parameters: Name Type Description Default inputs Union[List[FormattedInput], None] a list of inputs in chat format to generate responses for. None num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 logprobs bool whether to return the log probabilities or not. Defaults to False . False top_logprobs Optional[PositiveInt] the number of top log probabilities to return per output token generated. Defaults to None . None frequency_penalty float the repetition penalty to use for the generation. Defaults to 0.0 . 0.0 presence_penalty float the presence penalty to use for the generation. Defaults to 0.0 . 0.0 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 stop Optional[Union[str, List[str]]] a string or a list of strings to use as a stop sequence for the generation. Defaults to None . None response_format Optional[str] the format of the response to return. Must be one of \"text\" or \"json\". Read the documentation here for more information on how to use the JSON model from OpenAI. Defaults to text . None Returns: Type Description List[GenerateOutput] A list of lists of strings containing the generated responses for each input List[GenerateOutput] in inputs . Raises: Type Description DistilabelOfflineBatchGenerationNotFinishedException if the batch generation is not finished yet. ValueError if no job IDs were found to retrieve the results from. 
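When `offline_batch_generation_block_until_done` is not set, the caller is expected to poll until the batch jobs finish. The loop below is a hedged sketch of that flow; the import path of `DistilabelOfflineBatchGenerationNotFinishedException` is an assumption (recent releases expose it under `distilabel.exceptions`), so adjust it to the installed version.

```python
# Hedged sketch of manual polling (assumes OPENAI_API_KEY is set and that the
# exception lives under `distilabel.exceptions` in your distilabel version).
import time

from distilabel.exceptions import DistilabelOfflineBatchGenerationNotFinishedException
from distilabel.models.llms import OpenAILLM

llm = OpenAILLM(model="gpt-3.5-turbo", use_offline_batch_generation=True)
llm.load()

inputs = [[{"role": "user", "content": "Hello world!"}]]

while True:
    try:
        # The first call uploads the JSONL files and creates the batch jobs; later
        # calls re-check the stored `jobs_ids` until the results are available.
        outputs = llm.offline_batch_generate(inputs=inputs, max_new_tokens=64)
        break
    except DistilabelOfflineBatchGenerationNotFinishedException:
        time.sleep(60)  # the Batch API can take up to 24h, so poll sparingly

print(outputs)
```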
Source code in src/distilabel/models/llms/openai.py def offline_batch_generate(\n self,\n inputs: Union[List[\"FormattedInput\"], None] = None,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n logprobs: bool = False,\n top_logprobs: Optional[PositiveInt] = None,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[Union[str, List[str]]] = None,\n response_format: Optional[str] = None,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Uses the OpenAI batch API to generate `num_generations` responses for the given\n inputs.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n logprobs: whether to return the log probabilities or not. Defaults to `False`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. Defaults to `None`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: a string or a list of strings to use as a stop sequence for the generation.\n Defaults to `None`.\n response_format: the format of the response to return. Must be one of\n \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n for more information on how to use the JSON model from OpenAI. Defaults to `text`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input\n in `inputs`.\n\n Raises:\n DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n is not finished yet.\n ValueError: if no job IDs were found to retrieve the results from.\n \"\"\"\n if self.jobs_ids:\n return self._check_and_get_batch_results()\n\n if inputs:\n self.jobs_ids = self._create_jobs(\n inputs=inputs,\n **{\n \"model\": self.model,\n \"logprobs\": logprobs,\n \"top_logprobs\": top_logprobs,\n \"max_tokens\": max_new_tokens,\n \"n\": num_generations,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stop\": stop,\n \"response_format\": response_format,\n },\n )\n raise DistilabelOfflineBatchGenerationNotFinishedException(\n jobs_ids=self.jobs_ids\n )\n\n raise ValueError(\"No `inputs` were provided and no `jobs_ids` were found.\")\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._check_and_get_batch_results","title":"_check_and_get_batch_results() ","text":"Checks the status of the batch jobs and retrieves the results from the OpenAI Batch API. Returns: Type Description List[GenerateOutput] A list of lists of strings containing the generated responses for each input. Raises: Type Description ValueError if no job IDs were found to retrieve the results from. DistilabelOfflineBatchGenerationNotFinishedException if the batch generation is not finished yet. RuntimeError if the only batch job found failed. 
Source code in src/distilabel/models/llms/openai.py def _check_and_get_batch_results(self) -> List[\"GenerateOutput\"]:\n \"\"\"Checks the status of the batch jobs and retrieves the results from the OpenAI\n Batch API.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n\n Raises:\n ValueError: if no job IDs were found to retrieve the results from.\n DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n is not finished yet.\n RuntimeError: if the only batch job found failed.\n \"\"\"\n if not self.jobs_ids:\n raise ValueError(\"No job IDs were found to retrieve the results from.\")\n\n outputs = []\n for batch_id in self.jobs_ids:\n batch = self._get_openai_batch(batch_id)\n\n if batch.status in (\"validating\", \"in_progress\", \"finalizing\"):\n raise DistilabelOfflineBatchGenerationNotFinishedException(\n jobs_ids=self.jobs_ids\n )\n\n if batch.status in (\"failed\", \"expired\", \"cancelled\", \"cancelling\"):\n self._logger.error( # type: ignore\n f\"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'.\"\n )\n if len(self.jobs_ids) == 1:\n self.jobs_ids = None\n raise RuntimeError(\n f\"The only OpenAI API Batch that was created with ID '{batch_id}'\"\n f\" failed with status '{batch.status}'.\"\n )\n\n continue\n\n outputs.extend(self._retrieve_batch_results(batch))\n\n # sort by `custom_id` to return the results in the same order as the inputs\n outputs = sorted(outputs, key=lambda x: int(x[\"custom_id\"]))\n return [self._parse_output(output) for output in outputs]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._parse_output","title":"_parse_output(output) ","text":"Parses the output from the OpenAI Batch API into a list of strings. Parameters: Name Type Description Default output Dict[str, Any] the output to parse. required Returns: Type Description GenerateOutput A list of strings containing the generated responses for the input. Source code in src/distilabel/models/llms/openai.py def _parse_output(self, output: Dict[str, Any]) -> \"GenerateOutput\":\n \"\"\"Parses the output from the OpenAI Batch API into a list of strings.\n\n Args:\n output: the output to parse.\n\n Returns:\n A list of strings containing the generated responses for the input.\n \"\"\"\n from openai.types.chat import ChatCompletion as OpenAIChatCompletion\n\n if \"response\" not in output:\n return []\n\n if output[\"response\"][\"status_code\"] != 200:\n return []\n\n return self._generations_from_openai_completion(\n OpenAIChatCompletion(**output[\"response\"][\"body\"])\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._get_openai_batch","title":"_get_openai_batch(batch_id) ","text":"Gets a batch from the OpenAI Batch API. Parameters: Name Type Description Default batch_id str the ID of the batch to retrieve. required Returns: Type Description Batch The batch retrieved from the OpenAI Batch API. Raises: Type Description OpenAIError if there was an error while retrieving the batch from the OpenAI Batch API. 
Source code in src/distilabel/models/llms/openai.py def _get_openai_batch(self, batch_id: str) -> \"OpenAIBatch\":\n \"\"\"Gets a batch from the OpenAI Batch API.\n\n Args:\n batch_id: the ID of the batch to retrieve.\n\n Returns:\n The batch retrieved from the OpenAI Batch API.\n\n Raises:\n openai.OpenAIError: if there was an error while retrieving the batch from the\n OpenAI Batch API.\n \"\"\"\n import openai\n\n try:\n return self._client.batches.retrieve(batch_id)\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while retrieving batch '{batch_id}' from OpenAI: {e}\"\n )\n raise e\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._retrieve_batch_results","title":"_retrieve_batch_results(batch) ","text":"Retrieves the results of a batch from its output file, parsing the JSONL content into a list of dictionaries. Parameters: Name Type Description Default batch Batch the batch to retrieve the results from. required Returns: Type Description List[Dict[str, Any]] A list of dictionaries containing the results of the batch. Raises: Type Description AssertionError if no output file ID was found in the batch. Source code in src/distilabel/models/llms/openai.py def _retrieve_batch_results(self, batch: \"OpenAIBatch\") -> List[Dict[str, Any]]:\n \"\"\"Retrieves the results of a batch from its output file, parsing the JSONL content\n into a list of dictionaries.\n\n Args:\n batch: the batch to retrieve the results from.\n\n Returns:\n A list of dictionaries containing the results of the batch.\n\n Raises:\n AssertionError: if no output file ID was found in the batch.\n \"\"\"\n import openai\n\n assert batch.output_file_id, \"No output file ID was found in the batch.\"\n\n try:\n file_response = self._client.files.content(batch.output_file_id)\n return [orjson.loads(line) for line in file_response.text.splitlines()]\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while retrieving batch results from file '{batch.output_file_id}': {e}\"\n )\n return []\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jobs","title":"_create_jobs(inputs, **kwargs) ","text":"Creates jobs in the OpenAI Batch API to generate responses for the given inputs. Parameters: Name Type Description Default inputs List[FormattedInput] a list of inputs in chat format to generate responses for. required kwargs Any the keyword arguments to use for the generation. {} Returns: Type Description Tuple[str, ...] A list of job IDs created in the OpenAI Batch API. Source code in src/distilabel/models/llms/openai.py def _create_jobs(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> Tuple[str, ...]:\n \"\"\"Creates jobs in the OpenAI Batch API to generate responses for the given inputs.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n A list of job IDs created in the OpenAI Batch API.\n \"\"\"\n batch_input_files = self._create_batch_files(inputs=inputs, **kwargs)\n jobs = []\n for batch_input_file in batch_input_files:\n if batch := self._create_batch_api_job(batch_input_file):\n jobs.append(batch.id)\n return tuple(jobs)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_batch_api_job","title":"_create_batch_api_job(batch_input_file) ","text":"Creates a job in the OpenAI Batch API to generate responses for the given input file. 
Parameters: Name Type Description Default batch_input_file FileObject the input file to generate responses for. required Returns: Type Description Union[Batch, None] The batch job created in the OpenAI Batch API. Source code in src/distilabel/models/llms/openai.py def _create_batch_api_job(\n self, batch_input_file: \"OpenAIFileObject\"\n) -> Union[\"OpenAIBatch\", None]:\n \"\"\"Creates a job in the OpenAI Batch API to generate responses for the given input\n file.\n\n Args:\n batch_input_file: the input file to generate responses for.\n\n Returns:\n The batch job created in the OpenAI Batch API.\n \"\"\"\n import openai\n\n metadata = {\"description\": \"distilabel\"}\n\n if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME:\n metadata[\"distilabel_pipeline_name\"] = distilabel_pipeline_name\n\n if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID:\n metadata[\"distilabel_pipeline_cache_id\"] = distilabel_pipeline_cache_id\n\n batch = None\n try:\n batch = self._client.batches.create(\n completion_window=\"24h\",\n endpoint=\"/v1/chat/completions\",\n input_file_id=batch_input_file.id,\n metadata=metadata,\n )\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while creating OpenAI Batch API job for file with ID\"\n f\" '{batch_input_file.id}': {e}.\"\n )\n raise e\n return batch\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_batch_files","title":"_create_batch_files(inputs, **kwargs) ","text":"Creates the necessary input files for the batch API to generate responses. The maximum size of each file so the OpenAI Batch API can process it is 100MB, so we need to split the inputs into multiple files if necessary. More information: https://platform.openai.com/docs/api-reference/files/create Parameters: Name Type Description Default inputs List[FormattedInput] a list of inputs in chat format to generate responses for, optionally including structured output. required kwargs Any the keyword arguments to use for the generation. {} Returns: Type Description List[FileObject] The list of file objects created for the OpenAI Batch API. Raises: Type Description OpenAIError if there was an error while creating the batch input file in the OpenAI Batch API. Source code in src/distilabel/models/llms/openai.py def _create_batch_files(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> List[\"OpenAIFileObject\"]:\n \"\"\"Creates the necessary input files for the batch API to generate responses. 
The\n maximum size of each file so the OpenAI Batch API can process it is 100MB, so we\n need to split the inputs into multiple files if necessary.\n\n More information: https://platform.openai.com/docs/api-reference/files/create\n\n Args:\n inputs: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n The list of file objects created for the OpenAI Batch API.\n\n Raises:\n openai.OpenAIError: if there was an error while creating the batch input file\n in the OpenAI Batch API.\n \"\"\"\n import openai\n\n files = []\n for file_no, buffer in enumerate(\n self._create_jsonl_buffers(inputs=inputs, **kwargs)\n ):\n try:\n # TODO: add distilabel pipeline name and id\n batch_input_file = self._client.files.create(\n file=(self._name_for_openai_files(file_no), buffer),\n purpose=\"batch\",\n )\n files.append(batch_input_file)\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while creating OpenAI batch input file: {e}\"\n )\n raise e\n return files\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jsonl_buffers","title":"_create_jsonl_buffers(inputs, **kwargs) ","text":"Creates a generator of buffers containing the JSONL formatted inputs to be used by the OpenAI Batch API. The buffers created are of size 100MB or less. Parameters: Name Type Description Default inputs List[FormattedInput] a list of inputs in chat format to generate responses for, optionally including structured output. required kwargs Any the keyword arguments to use for the generation. {} Yields: Type Description BytesIO A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch BytesIO API. Source code in src/distilabel/models/llms/openai.py def _create_jsonl_buffers(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> Generator[io.BytesIO, None, None]:\n \"\"\"Creates a generator of buffers containing the JSONL formatted inputs to be\n used by the OpenAI Batch API. The buffers created are of size 100MB or less.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n kwargs: the keyword arguments to use for the generation.\n\n Yields:\n A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch\n API.\n \"\"\"\n buffer = io.BytesIO()\n buffer_current_size = 0\n for i, input in enumerate(inputs):\n # We create the smallest `custom_id` so we don't increase the size of the file\n # to much, but we can still sort the results with the order of the inputs.\n row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs)\n row_size = len(row)\n if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE:\n buffer.seek(0)\n yield buffer\n buffer = io.BytesIO()\n buffer_current_size = 0\n buffer.write(row)\n buffer_current_size += row_size\n\n if buffer_current_size > 0:\n buffer.seek(0)\n yield buffer\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jsonl_row","title":"_create_jsonl_row(input, custom_id, **kwargs) ","text":"Creates a JSONL formatted row to be used by the OpenAI Batch API. Parameters: Name Type Description Default input FormattedInput a list of inputs in chat format to generate responses for, optionally including structured output. required custom_id str a custom ID to use for the row. required kwargs Any the keyword arguments to use for the generation. 
{} Returns: Type Description bytes A JSONL formatted row to be used by the OpenAI Batch API. Source code in src/distilabel/models/llms/openai.py def _create_jsonl_row(\n self, input: \"FormattedInput\", custom_id: str, **kwargs: Any\n) -> bytes:\n \"\"\"Creates a JSONL formatted row to be used by the OpenAI Batch API.\n\n Args:\n input: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n custom_id: a custom ID to use for the row.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n A JSONL formatted row to be used by the OpenAI Batch API.\n \"\"\"\n # TODO: depending on the format of the input, add `response_format` to the kwargs\n row = {\n \"custom_id\": custom_id,\n \"method\": \"POST\",\n \"url\": \"/v1/chat/completions\",\n \"body\": {\"messages\": input, **kwargs},\n }\n json_row = orjson.dumps(row)\n return json_row + b\"\\n\"\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TogetherLLM","title":"TogetherLLM ","text":" Bases: OpenAILLM TogetherLLM LLM implementation running the async API client of OpenAI. Attributes: Name Type Description model the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\". Supported models can be found here. base_url Optional[RuntimeParameter[str]] the base URL to use for the Together API can be set with TOGETHER_BASE_URL . Defaults to None which means that the value set for the environment variable TOGETHER_BASE_URL will be used, or \"https://api.together.xyz/v1\" if not set. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Together API. Defaults to None which means that the value set for the environment variable TOGETHER_API_KEY will be used, or None if not set. _api_key_env_var str the name of the environment variable to use for the API key. It is meant to be used internally. Examples: Generate text: from distilabel.models.llms import AnyscaleLLM\n\nllm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Source code in src/distilabel/models/llms/together.py class TogetherLLM(OpenAILLM):\n \"\"\"TogetherLLM LLM implementation running the async API client of OpenAI.\n\n Attributes:\n model: the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\".\n Supported models can be found [here](https://api.together.xyz/models).\n base_url: the base URL to use for the Together API can be set with `TOGETHER_BASE_URL`.\n Defaults to `None` which means that the value set for the environment variable\n `TOGETHER_BASE_URL` will be used, or \"https://api.together.xyz/v1\" if not set.\n api_key: the API key to authenticate the requests to the Together API. Defaults to `None`\n which means that the value set for the environment variable `TOGETHER_API_KEY` will be\n used, or `None` if not set.\n _api_key_env_var: the name of the environment variable to use for the API key. 
It\n is meant to be used internally.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import AnyscaleLLM\n\n llm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n \"\"\"\n\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n \"TOGETHER_BASE_URL\", \"https://api.together.xyz/v1\"\n ),\n description=\"The base URL to use for the Together API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_TOGETHER_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Together API.\",\n )\n\n _api_key_env_var: str = PrivateAttr(_TOGETHER_API_KEY_ENV_VAR_NAME)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM","title":"VertexAILLM ","text":" Bases: AsyncLLM VertexAI LLM implementation running the async API clients for Gemini. - Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini
To use the VertexAILLM it is necessary to have configured the Google Cloud authentication using one of these methods: - Setting
GOOGLE_CLOUD_CREDENTIALS environment variable - Using
gcloud auth application-default login command - Using
vertexai.init function from the google-cloud-aiplatform library Attributes: Name Type Description model str the model name to use for the LLM e.g. \"gemini-1.0-pro\". Supported models. _aclient Optional[GenerativeModel] the GenerativeModel to use for the Vertex AI Gemini API. It is meant to be used internally. Set in the load method. Icon :simple-googlecloud: Examples: Generate text: from distilabel.models.llms import VertexAILLM\n\nllm = VertexAILLM(model=\"gemini-1.5-pro\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Source code in src/distilabel/models/llms/vertexai.py class VertexAILLM(AsyncLLM):\n \"\"\"VertexAI LLM implementation running the async API clients for Gemini.\n\n - Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini\n\n To use the `VertexAILLM` is necessary to have configured the Google Cloud authentication\n using one of these methods:\n\n - Setting `GOOGLE_CLOUD_CREDENTIALS` environment variable\n - Using `gcloud auth application-default login` command\n - Using `vertexai.init` function from the `google-cloud-aiplatform` library\n\n Attributes:\n model: the model name to use for the LLM e.g. \"gemini-1.0-pro\". [Supported models](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models).\n _aclient: the `GenerativeModel` to use for the Vertex AI Gemini API. It is meant\n to be used internally. Set in the `load` method.\n\n Icon:\n `:simple-googlecloud:`\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import VertexAILLM\n\n llm = VertexAILLM(model=\"gemini-1.5-pro\")\n\n llm.load()\n\n # Call the model\n output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n \"\"\"\n\n model: str\n\n _num_generations_param_supported = False\n\n _aclient: Optional[\"GenerativeModel\"] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the `GenerativeModel` class which has access to `generate_content_async` to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from vertexai.generative_models import GenerationConfig, GenerativeModel\n\n self._generation_config_class = GenerationConfig\n except ImportError as e:\n raise ImportError(\n \"vertexai is not installed. 
Please install it using\"\n \" `pip install google-cloud-aiplatform`.\"\n ) from e\n\n if _is_gemini_model(self.model):\n self._aclient = GenerativeModel(model_name=self.model)\n else:\n raise NotImplementedError(\n \"`VertexAILLM` is only implemented for `gemini` models that allow for `ChatType` data.\"\n )\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n def _chattype_to_content(self, input: \"StandardInput\") -> List[\"Content\"]:\n \"\"\"Converts a chat type to a list of content items expected by the API.\n\n Args:\n input: the chat type to be converted.\n\n Returns:\n List[str]: a list of content items expected by the API.\n \"\"\"\n from vertexai.generative_models import Content, Part\n\n contents = []\n for message in input:\n if message[\"role\"] not in [\"user\", \"model\"]:\n raise ValueError(\n \"`VertexAILLM only supports the roles 'user' or 'model'.\"\n )\n contents.append(\n Content(\n role=message[\"role\"], parts=[Part.from_text(message[\"content\"])]\n )\n )\n return contents\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: VertexChatType,\n temperature: Optional[float] = None,\n top_p: Optional[float] = None,\n top_k: Optional[int] = None,\n max_output_tokens: Optional[int] = None,\n stop_sequences: Optional[List[str]] = None,\n safety_settings: Optional[Dict[str, Any]] = None,\n tools: Optional[List[Dict[str, Any]]] = None,\n ) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the [VertexAI async client definition](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini).\n\n Args:\n input: a single input in chat format to generate responses for.\n temperature: Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to `None`.\n top_p: If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to `None`.\n top_k: If specified, top-k sampling will be used. Defaults to `None`.\n max_output_tokens: The maximum number of output tokens to generate per message. Defaults to `None`.\n stop_sequences: A list of stop sequences. Defaults to `None`.\n safety_settings: Safety configuration for returned content from the API. Defaults to `None`.\n tools: A potential list of tools that can be used by the API. 
Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from vertexai.generative_models import GenerationConfig\n\n content: \"GenerationResponse\" = await self._aclient.generate_content_async( # type: ignore\n contents=self._chattype_to_content(input),\n generation_config=GenerationConfig(\n candidate_count=1, # only one candidate allowed per call\n temperature=temperature,\n top_k=top_k,\n top_p=top_p,\n max_output_tokens=max_output_tokens,\n stop_sequences=stop_sequences,\n ),\n safety_settings=safety_settings, # type: ignore\n tools=tools, # type: ignore\n stream=False,\n )\n\n text = None\n try:\n text = content.candidates[0].text\n except ValueError:\n self._logger.warning( # type: ignore\n f\"Received no response using VertexAI client (model: '{self.model}').\"\n f\" Finish reason was: '{content.candidates[0].finish_reason}'.\"\n )\n return prepare_output([text], **self._get_llm_statistics(content))\n\n @staticmethod\n def _get_llm_statistics(content: \"GenerationResponse\") -> \"LLMStatistics\":\n return {\n \"input_tokens\": [content.usage_metadata.prompt_token_count],\n \"output_tokens\": [content.usage_metadata.candidates_token_count],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.load","title":"load() ","text":"Loads the GenerativeModel class which has access to generate_content_async to benefit from async requests. Source code in src/distilabel/models/llms/vertexai.py def load(self) -> None:\n \"\"\"Loads the `GenerativeModel` class which has access to `generate_content_async` to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from vertexai.generative_models import GenerationConfig, GenerativeModel\n\n self._generation_config_class = GenerationConfig\n except ImportError as e:\n raise ImportError(\n \"vertexai is not installed. Please install it using\"\n \" `pip install google-cloud-aiplatform`.\"\n ) from e\n\n if _is_gemini_model(self.model):\n self._aclient = GenerativeModel(model_name=self.model)\n else:\n raise NotImplementedError(\n \"`VertexAILLM` is only implemented for `gemini` models that allow for `ChatType` data.\"\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM._chattype_to_content","title":"_chattype_to_content(input) ","text":"Converts a chat type to a list of content items expected by the API. Parameters: Name Type Description Default input StandardInput the chat type to be converted. required Returns: Type Description List[Content] List[str]: a list of content items expected by the API. 
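Because `agenerate` is a coroutine, calling it on its own requires an event loop (inside a pipeline the synchronous `generate` wrapper takes care of this). A minimal sketch, assuming Google Cloud authentication is already configured and `google-cloud-aiplatform` is installed; the sampling values are illustrative only:

```python
import asyncio

from distilabel.models.llms import VertexAILLM

llm = VertexAILLM(model="gemini-1.5-pro")
llm.load()  # instantiates the `GenerativeModel` async client

# A single conversation; only the "user" and "model" roles are accepted.
result = asyncio.run(
    llm.agenerate(
        input=[{"role": "user", "content": "Hello world!"}],
        temperature=0.7,
        max_output_tokens=256,
    )
)
print(result)
```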
Source code in src/distilabel/models/llms/vertexai.py def _chattype_to_content(self, input: \"StandardInput\") -> List[\"Content\"]:\n \"\"\"Converts a chat type to a list of content items expected by the API.\n\n Args:\n input: the chat type to be converted.\n\n Returns:\n List[str]: a list of content items expected by the API.\n \"\"\"\n from vertexai.generative_models import Content, Part\n\n contents = []\n for message in input:\n if message[\"role\"] not in [\"user\", \"model\"]:\n raise ValueError(\n \"`VertexAILLM only supports the roles 'user' or 'model'.\"\n )\n contents.append(\n Content(\n role=message[\"role\"], parts=[Part.from_text(message[\"content\"])]\n )\n )\n return contents\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.agenerate","title":"agenerate(input, temperature=None, top_p=None, top_k=None, max_output_tokens=None, stop_sequences=None, safety_settings=None, tools=None) async ","text":"Generates num_generations responses for the given input using the VertexAI async client definition. Parameters: Name Type Description Default input VertexChatType a single input in chat format to generate responses for. required temperature Optional[float] Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to None . None top_p Optional[float] If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to None . None top_k Optional[int] If specified, top-k sampling will be used. Defaults to None . None max_output_tokens Optional[int] The maximum number of output tokens to generate per message. Defaults to None . None stop_sequences Optional[List[str]] A list of stop sequences. Defaults to None . None safety_settings Optional[Dict[str, Any]] Safety configuration for returned content from the API. Defaults to None . None tools Optional[List[Dict[str, Any]]] A potential list of tools that can be used by the API. Defaults to None . None Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/vertexai.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: VertexChatType,\n temperature: Optional[float] = None,\n top_p: Optional[float] = None,\n top_k: Optional[int] = None,\n max_output_tokens: Optional[int] = None,\n stop_sequences: Optional[List[str]] = None,\n safety_settings: Optional[Dict[str, Any]] = None,\n tools: Optional[List[Dict[str, Any]]] = None,\n) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the [VertexAI async client definition](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini).\n\n Args:\n input: a single input in chat format to generate responses for.\n temperature: Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to `None`.\n top_p: If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to `None`.\n top_k: If specified, top-k sampling will be used. Defaults to `None`.\n max_output_tokens: The maximum number of output tokens to generate per message. Defaults to `None`.\n stop_sequences: A list of stop sequences. Defaults to `None`.\n safety_settings: Safety configuration for returned content from the API. Defaults to `None`.\n tools: A potential list of tools that can be used by the API. 
Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from vertexai.generative_models import GenerationConfig\n\n content: \"GenerationResponse\" = await self._aclient.generate_content_async( # type: ignore\n contents=self._chattype_to_content(input),\n generation_config=GenerationConfig(\n candidate_count=1, # only one candidate allowed per call\n temperature=temperature,\n top_k=top_k,\n top_p=top_p,\n max_output_tokens=max_output_tokens,\n stop_sequences=stop_sequences,\n ),\n safety_settings=safety_settings, # type: ignore\n tools=tools, # type: ignore\n stream=False,\n )\n\n text = None\n try:\n text = content.candidates[0].text\n except ValueError:\n self._logger.warning( # type: ignore\n f\"Received no response using VertexAI client (model: '{self.model}').\"\n f\" Finish reason was: '{content.candidates[0].finish_reason}'.\"\n )\n return prepare_output([text], **self._get_llm_statistics(content))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM","title":"ClientvLLM ","text":" Bases: OpenAILLM , MagpieChatTemplateMixin A client for the vLLM server implementing the OpenAI API specification. Attributes: Name Type Description base_url the base URL of the vLLM server. Defaults to \"http://localhost:8000\" . max_retries the maximum number of times to retry the request to the API before failing. Defaults to 6 . timeout the maximum time in seconds to wait for a response from the API. Defaults to 120 . httpx_client_kwargs extra kwargs that will be passed to the httpx.AsyncClient created to comunicate with the vLLM server. Defaults to None . tokenizer Optional[str] the Hugging Face Hub repo id or path of the tokenizer that will be used to apply the chat template and tokenize the inputs before sending it to the server. Defaults to None . tokenizer_revision Optional[str] the revision of the tokenizer to load. Defaults to None . _aclient Optional[str] the httpx.AsyncClient used to comunicate with the vLLM server. Defaults to None . Runtime parameters base_url : the base url of the vLLM server. Defaults to \"http://localhost:8000\" . max_retries : the maximum number of times to retry the request to the API before failing. Defaults to 6 . timeout : the maximum time in seconds to wait for a response from the API. Defaults to 120 . httpx_client_kwargs : extra kwargs that will be passed to the httpx.AsyncClient created to comunicate with the vLLM server. Defaults to None . Examples: Generate text: from distilabel.models.llms import ClientvLLM\n\nllm = ClientvLLM(\n base_url=\"http://localhost:8000/v1\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n)\n\nllm.load()\n\nresults = llm.generate_outputs(\n inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n temperature=0.7,\n top_p=1.0,\n max_new_tokens=256,\n)\n# [\n# [\n# \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n# \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n# \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. 
What's on your mind?\"\n# ]\n# ]\n Source code in src/distilabel/models/llms/vllm.py class ClientvLLM(OpenAILLM, MagpieChatTemplateMixin):\n \"\"\"A client for the `vLLM` server implementing the OpenAI API specification.\n\n Attributes:\n base_url: the base URL of the `vLLM` server. Defaults to `\"http://localhost:8000\"`.\n max_retries: the maximum number of times to retry the request to the API before\n failing. Defaults to `6`.\n timeout: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n httpx_client_kwargs: extra kwargs that will be passed to the `httpx.AsyncClient`\n created to comunicate with the `vLLM` server. Defaults to `None`.\n tokenizer: the Hugging Face Hub repo id or path of the tokenizer that will be used\n to apply the chat template and tokenize the inputs before sending it to the\n server. Defaults to `None`.\n tokenizer_revision: the revision of the tokenizer to load. Defaults to `None`.\n _aclient: the `httpx.AsyncClient` used to comunicate with the `vLLM` server. Defaults\n to `None`.\n\n Runtime parameters:\n - `base_url`: the base url of the `vLLM` server. Defaults to `\"http://localhost:8000\"`.\n - `max_retries`: the maximum number of times to retry the request to the API before\n failing. Defaults to `6`.\n - `timeout`: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n - `httpx_client_kwargs`: extra kwargs that will be passed to the `httpx.AsyncClient`\n created to comunicate with the `vLLM` server. Defaults to `None`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import ClientvLLM\n\n llm = ClientvLLM(\n base_url=\"http://localhost:8000/v1\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n )\n\n llm.load()\n\n results = llm.generate_outputs(\n inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n temperature=0.7,\n top_p=1.0,\n max_new_tokens=256,\n )\n # [\n # [\n # \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n # \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n # \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. What's on your mind?\"\n # ]\n # ]\n ```\n \"\"\"\n\n model: str = \"\" # Default value so it's not needed to `ClientvLLM(model=\"...\")`\n tokenizer: Optional[str] = None\n tokenizer_revision: Optional[str] = None\n\n # We need the sync client to get the list of models\n _client: \"OpenAI\" = PrivateAttr(None)\n _tokenizer: \"PreTrainedTokenizer\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Creates an `httpx.AsyncClient` to connect to the vLLM server and a tokenizer\n optionally.\"\"\"\n\n self.api_key = SecretStr(\"EMPTY\")\n\n # We need to first create the sync client to get the model name that will be used\n # in the `super().load()` when creating the logger.\n try:\n from openai import OpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. 
Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n self._client = OpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(), # type: ignore\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n super().load()\n\n try:\n from transformers import AutoTokenizer\n except ImportError as ie:\n raise ImportError(\n \"To use `ClientvLLM` you need to install `transformers`.\"\n \"Please install it using `pip install transformers`.\"\n ) from ie\n\n self._tokenizer = AutoTokenizer.from_pretrained(\n self.tokenizer, revision=self.tokenizer_revision\n )\n\n @cached_property\n def model_name(self) -> str: # type: ignore\n \"\"\"Returns the name of the model served with vLLM server.\"\"\"\n models = self._client.models.list()\n return models.data[0].id\n\n def _prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n prompt: str = (\n self._tokenizer.apply_chat_template( # type: ignore\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True, # type: ignore\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n frequency_penalty: float = 0.0,\n logit_bias: Optional[Dict[str, int]] = None,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n ) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for each input.\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n logit_bias: modify the likelihood of specified tokens appearing in the completion.\n Defaults to ``\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: nucleus sampling. The value refers to the top-p tokens that should be\n considered for sampling. Defaults to `1.0`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n\n completion = await self._aclient.completions.create(\n model=self.model_name,\n prompt=self._prepare_input(input), # type: ignore\n n=num_generations,\n max_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n logit_bias=logit_bias,\n presence_penalty=presence_penalty,\n temperature=temperature,\n top_p=top_p,\n )\n\n generations = []\n for choice in completion.choices:\n text = choice.text\n if text == \"\":\n self._logger.warning( # type: ignore\n f\"Received no response from vLLM server (model: '{self.model_name}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(text)\n\n return prepare_output(generations, **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.model_name","title":"model_name: str cached property ","text":"Returns the name of the model served with vLLM server. 
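`model_name` is resolved lazily by querying the server's model list, so only `base_url` (plus a tokenizer for the chat template) has to be configured on the client. A minimal sketch, assuming an OpenAI-compatible vLLM server is already running locally (for example started with `vllm serve ...`; the exact command depends on your vLLM version, so treat it as an assumption):

```python
from distilabel.models.llms import ClientvLLM

llm = ClientvLLM(
    base_url="http://localhost:8000/v1",
    tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct",
)
llm.load()  # creates the OpenAI-compatible clients and loads the tokenizer

# The id of the first model reported by the server's models endpoint.
print(llm.model_name)
```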
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.load","title":"load() ","text":"Creates an httpx.AsyncClient to connect to the vLLM server and a tokenizer optionally. Source code in src/distilabel/models/llms/vllm.py def load(self) -> None:\n \"\"\"Creates an `httpx.AsyncClient` to connect to the vLLM server and a tokenizer\n optionally.\"\"\"\n\n self.api_key = SecretStr(\"EMPTY\")\n\n # We need to first create the sync client to get the model name that will be used\n # in the `super().load()` when creating the logger.\n try:\n from openai import OpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n self._client = OpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(), # type: ignore\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n super().load()\n\n try:\n from transformers import AutoTokenizer\n except ImportError as ie:\n raise ImportError(\n \"To use `ClientvLLM` you need to install `transformers`.\"\n \"Please install it using `pip install transformers`.\"\n ) from ie\n\n self._tokenizer = AutoTokenizer.from_pretrained(\n self.tokenizer, revision=self.tokenizer_revision\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM._prepare_input","title":"_prepare_input(input) ","text":"Prepares the input (applying the chat template and tokenization) for the provided input. Parameters: Name Type Description Default input StandardInput the input list containing chat items. required Returns: Type Description str The prompt to send to the LLM. Source code in src/distilabel/models/llms/vllm.py def _prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n prompt: str = (\n self._tokenizer.apply_chat_template( # type: ignore\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True, # type: ignore\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.agenerate","title":"agenerate(input, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, logit_bias=None, presence_penalty=0.0, temperature=1.0, top_p=1.0) async ","text":"Generates num_generations responses for each input. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 frequency_penalty float the repetition penalty to use for the generation. Defaults to 0.0 . 0.0 logit_bias Optional[Dict[str, int]] modify the likelihood of specified tokens appearing in the completion. Defaults to `` None presence_penalty float the presence penalty to use for the generation. Defaults to 0.0 . 0.0 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float nucleus sampling. The value refers to the top-p tokens that should be considered for sampling. Defaults to 1.0 . 1.0 Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. 
Source code in src/distilabel/models/llms/vllm.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n frequency_penalty: float = 0.0,\n logit_bias: Optional[Dict[str, int]] = None,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for each input.\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n logit_bias: modify the likelihood of specified tokens appearing in the completion.\n Defaults to ``\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: nucleus sampling. The value refers to the top-p tokens that should be\n considered for sampling. Defaults to `1.0`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n\n completion = await self._aclient.completions.create(\n model=self.model_name,\n prompt=self._prepare_input(input), # type: ignore\n n=num_generations,\n max_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n logit_bias=logit_bias,\n presence_penalty=presence_penalty,\n temperature=temperature,\n top_p=top_p,\n )\n\n generations = []\n for choice in completion.choices:\n text = choice.text\n if text == \"\":\n self._logger.warning( # type: ignore\n f\"Received no response from vLLM server (model: '{self.model_name}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(text)\n\n return prepare_output(generations, **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM","title":"vLLM ","text":" Bases: LLM , MagpieChatTemplateMixin , CudaDevicePlacementMixin vLLM library LLM implementation. Attributes: Name Type Description model str the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. dtype str the data type to use for the model. Defaults to auto . trust_remote_code bool whether to trust the remote code when loading the model. Defaults to False . quantization Optional[str] the quantization mode to use for the model. Defaults to None . revision Optional[str] the revision of the model to load. Defaults to None . tokenizer Optional[str] the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer files. If not provided, the tokenizer will be loaded from the model directory. Defaults to None . tokenizer_mode Literal['auto', 'slow'] the mode to use for the tokenizer. Defaults to auto . tokenizer_revision Optional[str] the revision of the tokenizer to load. Defaults to None . skip_tokenizer_init bool whether to skip the initialization of the tokenizer. Defaults to False . chat_template Optional[str] a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None . 
structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]] a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. seed int the seed to use for the random number generator. Defaults to 0 . extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]] additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {} . _model LLM the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. _tokenizer PreTrainedTokenizer the tokenizer instance used to format the prompt before passing it to the LLM . This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. use_magpie_template bool a flag used to enable/disable applying the Magpie pre-query template. Defaults to False . magpie_pre_query_template str the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow-up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . References - https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
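The `extra_kwargs` runtime parameter described below is forwarded verbatim to `vllm.LLM`, which is the usual place to set engine-level options. A sketch with commonly used engine arguments; the argument names depend on the installed `vllm` version and the model id is only an example, so treat both as assumptions:

```python
from distilabel.models.llms import vLLM

llm = vLLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    extra_kwargs={
        "tensor_parallel_size": 2,       # shard the weights across two GPUs
        "gpu_memory_utilization": 0.90,  # fraction of each GPU vLLM may reserve
        "max_model_len": 4096,           # cap the context window
    },
)
llm.load()
```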
Runtime parameters extra_kwargs : additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Examples: Generate text: from distilabel.models.llms import vLLM\n\n# You can pass a custom chat_template to the model\nllm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pathlib import Path\nfrom distilabel.models.llms import vLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\"\n structured_output={\"format\": \"json\", \"schema\": Character},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/vllm.py class vLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin):\n \"\"\"`vLLM` library LLM implementation.\n\n Attributes:\n model: the model Hugging Face Hub repo id or a path to a directory containing the\n model weights and configuration files.\n dtype: the data type to use for the model. Defaults to `auto`.\n trust_remote_code: whether to trust the remote code when loading the model. Defaults\n to `False`.\n quantization: the quantization mode to use for the model. Defaults to `None`.\n revision: the revision of the model to load. Defaults to `None`.\n tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing\n the tokenizer files. If not provided, the tokenizer will be loaded from the\n model directory. Defaults to `None`.\n tokenizer_mode: the mode to use for the tokenizer. Defaults to `auto`.\n tokenizer_revision: the revision of the tokenizer to load. Defaults to `None`.\n skip_tokenizer_init: whether to skip the initialization of the tokenizer. Defaults\n to `False`.\n chat_template: a chat template that will be used to build the prompts before\n sending them to the model. If not provided, the chat template defined in the\n tokenizer config will be used. If not provided and the tokenizer doesn't have\n a chat template, then ChatML template will be used. Defaults to `None`.\n structured_output: a dictionary containing the structured output configuration or if more\n fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n seed: the seed to use for the random number generator. Defaults to `0`.\n extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n `LLM` class of `vllm` library. Defaults to `{}`.\n _model: the `vLLM` model instance. This attribute is meant to be used internally\n and should not be accessed directly. It will be set in the `load` method.\n _tokenizer: the tokenizer instance used to format the prompt before passing it to\n the `LLM`. This attribute is meant to be used internally and should not be\n accessed directly. It will be set in the `load` method.\n use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n template. Defaults to `False`.\n magpie_pre_query_template: the pre-query template to be applied to the prompt or\n sent to the LLM to generate an instruction or a follow up user message. Valid\n values are \"llama3\", \"qwen2\" or another pre-query template provided. 
Defaults\n to `None`.\n\n References:\n - https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\n\n Runtime parameters:\n - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to\n the `LLM` class of `vllm` library.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import vLLM\n\n # You can pass a custom chat_template to the model\n llm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n )\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pathlib import Path\n from distilabel.models.llms import vLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\"\n structured_output={\"format\": \"json\", \"schema\": Character},\n )\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n dtype: str = \"auto\"\n trust_remote_code: bool = False\n quantization: Optional[str] = None\n revision: Optional[str] = None\n\n tokenizer: Optional[str] = None\n tokenizer_mode: Literal[\"auto\", \"slow\"] = \"auto\"\n tokenizer_revision: Optional[str] = None\n skip_tokenizer_init: bool = False\n chat_template: Optional[str] = None\n\n seed: int = 0\n\n extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n default_factory=dict,\n description=\"Additional dictionary of keyword arguments that will be passed to the\"\n \" `vLLM` class of `vllm` library. See all the supported arguments at: \"\n \"https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\",\n )\n structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n\n _model: \"_vLLM\" = PrivateAttr(None)\n _tokenizer: \"PreTrainedTokenizer\" = PrivateAttr(None)\n _structured_output_logits_processor: Optional[Callable] = PrivateAttr(default=None)\n\n def load(self) -> None:\n \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\n Additionally, this method also sets the `chat_template` for the tokenizer, so as to properly\n parse the list of OpenAI formatted inputs using the expected format by the model, otherwise, the\n default value is ChatML format, unless explicitly provided.\n \"\"\"\n super().load()\n\n CudaDevicePlacementMixin.load(self)\n\n try:\n from vllm import LLM as _vLLM\n except ImportError as ie:\n raise ImportError(\n \"vLLM is not installed. 
Please install it using `pip install vllm`.\"\n ) from ie\n\n self._model = _vLLM(\n self.model,\n dtype=self.dtype,\n trust_remote_code=self.trust_remote_code,\n quantization=self.quantization,\n revision=self.revision,\n tokenizer=self.tokenizer,\n tokenizer_mode=self.tokenizer_mode,\n tokenizer_revision=self.tokenizer_revision,\n skip_tokenizer_init=self.skip_tokenizer_init,\n seed=self.seed,\n **self.extra_kwargs, # type: ignore\n )\n\n self._tokenizer = self._model.get_tokenizer() # type: ignore\n if self.chat_template is not None:\n self._tokenizer.chat_template = self.chat_template # type: ignore\n\n if self.structured_output:\n self._structured_output_logits_processor = self._prepare_structured_output(\n self.structured_output\n )\n\n def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n self._cleanup_vllm_model()\n self._model = None # type: ignore\n self._tokenizer = None # type: ignore\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n\n def _cleanup_vllm_model(self) -> None:\n import torch # noqa\n from vllm.distributed.parallel_state import (\n destroy_distributed_environment,\n destroy_model_parallel,\n )\n\n destroy_model_parallel()\n destroy_distributed_environment()\n del self._model.llm_engine.model_executor\n del self._model\n with contextlib.suppress(AssertionError):\n torch.distributed.destroy_process_group()\n gc.collect()\n if torch.cuda.is_available():\n torch.cuda.empty_cache()\n torch.cuda.synchronize()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n if self._tokenizer.chat_template is None:\n return [item[\"content\"] for item in input if item[\"role\"] == \"user\"][0]\n\n prompt: str = (\n self._tokenizer.apply_chat_template(\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True, # type: ignore\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n\n def _prepare_batches(\n self, inputs: List[\"StructuredInput\"]\n ) -> Tuple[List[Tuple[List[str], \"OutlinesStructuredOutputType\"]], List[int]]:\n \"\"\"Prepares the inputs by grouping them by the structured output.\n\n When we generate structured outputs with schemas obtained from a dataset, we need to\n prepare the data to try to send batches of inputs instead of single inputs to the model\n to take advante of the engine. So we group the inputs by the structured output to be\n passed in the `generate` method.\n\n Args:\n inputs: The batch of inputs passed to the generate method. 
As we expect to be generating\n structured outputs, each element will be a tuple containing the instruction and the\n structured output.\n\n Returns:\n The prepared batches (sub-batches let's say) to be passed to the `generate` method.\n Each new tuple will contain instead of the single instruction, a list of instructions\n \"\"\"\n instruction_order = {}\n batches: Dict[str, List[str]] = {}\n for i, (instruction, structured_output) in enumerate(inputs):\n instruction = self.prepare_input(instruction)\n instruction_order[instruction] = i\n\n structured_output = json.dumps(structured_output)\n if structured_output not in batches:\n batches[structured_output] = [instruction]\n else:\n batches[structured_output].append(instruction)\n\n # Built a list with instructions sorted by structured output\n flat_instructions = [\n instruction for _, group in batches.items() for instruction in group\n ]\n\n # Generate the list of indices based on the original order\n sorted_indices = [\n instruction_order[instruction] for instruction in flat_instructions\n ]\n\n return [\n (batch, json.loads(schema)) for schema, batch in batches.items()\n ], sorted_indices\n\n @validate_call\n def generate( # noqa: C901 # type: ignore\n self,\n inputs: List[FormattedInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n presence_penalty: float = 0.0,\n frequency_penalty: float = 0.0,\n repetition_penalty: float = 1.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n top_k: int = -1,\n min_p: float = 0.0,\n logprobs: Optional[PositiveInt] = None,\n stop: Optional[List[str]] = None,\n stop_token_ids: Optional[List[int]] = None,\n include_stop_str_in_output: bool = False,\n logits_processors: Optional[LogitsProcessors] = None,\n extra_sampling_params: Optional[Dict[str, Any]] = None,\n ) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for each input.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n repetition_penalty: the repetition penalty to use for the generation Defaults to\n `1.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n top_k: the top-k value to use for the generation. Defaults to `0`.\n min_p: the minimum probability to use for the generation. Defaults to `0.0`.\n logprobs: number of log probabilities to return per output token. If `None`,\n then no log probability won't be returned. Defaults to `None`.\n stop: a list of strings that will be used to stop the generation when found.\n Defaults to `None`.\n stop_token_ids: a list of token ids that will be used to stop the generation\n when found. 
Defaults to `None`.\n include_stop_str_in_output: whether to include the stop string in the output.\n Defaults to `False`.\n logits_processors: a list of functions to process the logits before sampling.\n Defaults to `None`.\n extra_sampling_params: dictionary with additional arguments to be passed to\n the `SamplingParams` class from `vllm`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from vllm import SamplingParams\n\n if not logits_processors:\n logits_processors = []\n\n if extra_sampling_params is None:\n extra_sampling_params = {}\n\n structured_output = None\n\n if isinstance(inputs[0], tuple):\n # Prepare the batches for structured generation\n prepared_batches, sorted_indices = self._prepare_batches(inputs) # type: ignore\n else:\n # Simulate a batch without the structured output content\n prepared_batches = [([self.prepare_input(input) for input in inputs], None)] # type: ignore\n sorted_indices = None\n\n # Case in which we have a single structured output for the dataset\n if self._structured_output_logits_processor:\n logits_processors.append(self._structured_output_logits_processor)\n\n batched_outputs: List[\"LLMOutput\"] = []\n generations = []\n\n for prepared_inputs, structured_output in prepared_batches:\n if self.structured_output is not None and structured_output is not None:\n # TODO: warning\n pass\n\n if structured_output is not None:\n logits_processors.append(\n self._prepare_structured_output(structured_output) # type: ignore\n )\n\n sampling_params = SamplingParams( # type: ignore\n n=num_generations,\n presence_penalty=presence_penalty,\n frequency_penalty=frequency_penalty,\n repetition_penalty=repetition_penalty,\n temperature=temperature,\n top_p=top_p,\n top_k=top_k,\n min_p=min_p,\n max_tokens=max_new_tokens,\n logprobs=logprobs,\n stop=stop,\n stop_token_ids=stop_token_ids,\n include_stop_str_in_output=include_stop_str_in_output,\n logits_processors=logits_processors,\n **extra_sampling_params,\n )\n\n batch_outputs: List[\"RequestOutput\"] = self._model.generate(\n prompts=prepared_inputs,\n sampling_params=sampling_params,\n use_tqdm=False,\n )\n\n # Remove structured output logit processor to avoid stacking structured output\n # logits processors that leads to non-sense generations\n if structured_output is not None:\n logits_processors.pop(-1)\n\n for input, outputs in zip(prepared_inputs, batch_outputs):\n texts, statistics, outputs_logprobs = self._process_outputs(\n input, outputs\n )\n batched_outputs.append(texts)\n generations.append(\n prepare_output(\n generations=texts,\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=outputs_logprobs,\n )\n )\n\n if sorted_indices is not None:\n pairs = list(enumerate(sorted_indices))\n pairs.sort(key=lambda x: x[1])\n generations = [generations[original_idx] for original_idx, _ in pairs]\n\n return generations\n\n def _process_outputs(\n self, input: str, outputs: \"RequestOutput\"\n ) -> Tuple[\"LLMOutput\", \"LLMStatistics\", \"LLMLogprobs\"]:\n texts = []\n outputs_logprobs = []\n statistics = {\n \"input_tokens\": [compute_tokens(input, self._tokenizer.encode)]\n * len(outputs.outputs),\n \"output_tokens\": [],\n }\n for output in outputs.outputs:\n texts.append(output.text)\n statistics[\"output_tokens\"].append(len(output.token_ids))\n if output.logprobs is not None:\n outputs_logprobs.append(self._get_llm_logprobs(output))\n return texts, statistics, outputs_logprobs\n\n def 
_prepare_structured_output(\n self, structured_output: \"OutlinesStructuredOutputType\"\n ) -> Union[Callable, None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n assert structured_output is not None, \"`structured_output` cannot be `None`\"\n\n result = prepare_guided_output(structured_output, \"vllm\", self._model)\n if (schema := result.get(\"schema\")) and self.structured_output:\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n\n def _get_llm_logprobs(self, output: \"CompletionOutput\") -> List[List[\"Logprob\"]]:\n logprobs = []\n for token_logprob in output.logprobs: # type: ignore\n token_logprobs = []\n for logprob in token_logprob.values():\n token_logprobs.append(\n {\"token\": logprob.decoded_token, \"logprob\": logprob.logprob}\n )\n logprobs.append(token_logprobs)\n return logprobs\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.load","title":"load() ","text":"Loads the vLLM model using either the path or the Hugging Face Hub repository id. Additionally, this method also sets the chat_template for the tokenizer, so as to properly parse the list of OpenAI formatted inputs using the expected format by the model, otherwise, the default value is ChatML format, unless explicitly provided. Source code in src/distilabel/models/llms/vllm.py def load(self) -> None:\n \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\n Additionally, this method also sets the `chat_template` for the tokenizer, so as to properly\n parse the list of OpenAI formatted inputs using the expected format by the model, otherwise, the\n default value is ChatML format, unless explicitly provided.\n \"\"\"\n super().load()\n\n CudaDevicePlacementMixin.load(self)\n\n try:\n from vllm import LLM as _vLLM\n except ImportError as ie:\n raise ImportError(\n \"vLLM is not installed. Please install it using `pip install vllm`.\"\n ) from ie\n\n self._model = _vLLM(\n self.model,\n dtype=self.dtype,\n trust_remote_code=self.trust_remote_code,\n quantization=self.quantization,\n revision=self.revision,\n tokenizer=self.tokenizer,\n tokenizer_mode=self.tokenizer_mode,\n tokenizer_revision=self.tokenizer_revision,\n skip_tokenizer_init=self.skip_tokenizer_init,\n seed=self.seed,\n **self.extra_kwargs, # type: ignore\n )\n\n self._tokenizer = self._model.get_tokenizer() # type: ignore\n if self.chat_template is not None:\n self._tokenizer.chat_template = self.chat_template # type: ignore\n\n if self.structured_output:\n self._structured_output_logits_processor = self._prepare_structured_output(\n self.structured_output\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.unload","title":"unload() ","text":"Unloads the vLLM model. 
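When the model is used outside of a pipeline, calling `unload` after generation releases the engine, the distributed state and the CUDA cache, so another model can be loaded in the same process. A minimal sketch (model id assumed for illustration):

```python
from distilabel.models.llms import vLLM

llm = vLLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct")
llm.load()

outputs = llm.generate_outputs(
    inputs=[[{"role": "user", "content": "Hello world!"}]],
    max_new_tokens=64,
)

llm.unload()  # frees the vLLM engine and empties the CUDA cache
```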
Source code in src/distilabel/models/llms/vllm.py def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n self._cleanup_vllm_model()\n self._model = None # type: ignore\n self._tokenizer = None # type: ignore\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.prepare_input","title":"prepare_input(input) ","text":"Prepares the input (applying the chat template and tokenization) for the provided input. Parameters: Name Type Description Default input StandardInput the input list containing chat items. required Returns: Type Description str The prompt to send to the LLM. Source code in src/distilabel/models/llms/vllm.py def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n if self._tokenizer.chat_template is None:\n return [item[\"content\"] for item in input if item[\"role\"] == \"user\"][0]\n\n prompt: str = (\n self._tokenizer.apply_chat_template(\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True, # type: ignore\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM._prepare_batches","title":"_prepare_batches(inputs) ","text":"Prepares the inputs by grouping them by the structured output. When we generate structured outputs with schemas obtained from a dataset, we need to prepare the data to try to send batches of inputs instead of single inputs to the model to take advante of the engine. So we group the inputs by the structured output to be passed in the generate method. Parameters: Name Type Description Default inputs List[StructuredInput] The batch of inputs passed to the generate method. As we expect to be generating structured outputs, each element will be a tuple containing the instruction and the structured output. required Returns: Type Description List[Tuple[List[str], OutlinesStructuredOutputType]] The prepared batches (sub-batches let's say) to be passed to the generate method. List[int] Each new tuple will contain instead of the single instruction, a list of instructions Source code in src/distilabel/models/llms/vllm.py def _prepare_batches(\n self, inputs: List[\"StructuredInput\"]\n) -> Tuple[List[Tuple[List[str], \"OutlinesStructuredOutputType\"]], List[int]]:\n \"\"\"Prepares the inputs by grouping them by the structured output.\n\n When we generate structured outputs with schemas obtained from a dataset, we need to\n prepare the data to try to send batches of inputs instead of single inputs to the model\n to take advante of the engine. So we group the inputs by the structured output to be\n passed in the `generate` method.\n\n Args:\n inputs: The batch of inputs passed to the generate method. 
As we expect to be generating\n structured outputs, each element will be a tuple containing the instruction and the\n structured output.\n\n Returns:\n The prepared batches (sub-batches let's say) to be passed to the `generate` method.\n Each new tuple will contain instead of the single instruction, a list of instructions\n \"\"\"\n instruction_order = {}\n batches: Dict[str, List[str]] = {}\n for i, (instruction, structured_output) in enumerate(inputs):\n instruction = self.prepare_input(instruction)\n instruction_order[instruction] = i\n\n structured_output = json.dumps(structured_output)\n if structured_output not in batches:\n batches[structured_output] = [instruction]\n else:\n batches[structured_output].append(instruction)\n\n # Built a list with instructions sorted by structured output\n flat_instructions = [\n instruction for _, group in batches.items() for instruction in group\n ]\n\n # Generate the list of indices based on the original order\n sorted_indices = [\n instruction_order[instruction] for instruction in flat_instructions\n ]\n\n return [\n (batch, json.loads(schema)) for schema, batch in batches.items()\n ], sorted_indices\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, logprobs=None, stop=None, stop_token_ids=None, include_stop_str_in_output=False, logits_processors=None, extra_sampling_params=None) ","text":"Generates num_generations responses for each input. Parameters: Name Type Description Default inputs List[FormattedInput] a list of inputs in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 presence_penalty float the presence penalty to use for the generation. Defaults to 0.0 . 0.0 frequency_penalty float the repetition penalty to use for the generation. Defaults to 0.0 . 0.0 repetition_penalty float the repetition penalty to use for the generation Defaults to 1.0 . 1.0 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 top_k int the top-k value to use for the generation. Defaults to 0 . -1 min_p float the minimum probability to use for the generation. Defaults to 0.0 . 0.0 logprobs Optional[PositiveInt] number of log probabilities to return per output token. If None , then no log probability won't be returned. Defaults to None . None stop Optional[List[str]] a list of strings that will be used to stop the generation when found. Defaults to None . None stop_token_ids Optional[List[int]] a list of token ids that will be used to stop the generation when found. Defaults to None . None include_stop_str_in_output bool whether to include the stop string in the output. Defaults to False . False logits_processors Optional[LogitsProcessors] a list of functions to process the logits before sampling. Defaults to None . None extra_sampling_params Optional[Dict[str, Any]] dictionary with additional arguments to be passed to the SamplingParams class from vllm . None Returns: Type Description List[GenerateOutput] A list of lists of strings containing the generated responses for each input. 
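Most of the parameters above map directly onto vLLM's `SamplingParams`. A short sketch showing a direct call to `generate` with a few of them (model id and values assumed for illustration):

```python
from distilabel.models.llms import vLLM

llm = vLLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct")
llm.load()

generations = llm.generate(
    inputs=[
        [{"role": "user", "content": "Name three Python web frameworks."}],
    ],
    num_generations=2,   # two candidates per input
    max_new_tokens=128,
    temperature=0.7,
    top_p=0.9,
    stop=["\n\n"],       # stop at the first blank line
    logprobs=5,          # keep the top-5 logprobs per generated token
)
```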
Source code in src/distilabel/models/llms/vllm.py @validate_call\ndef generate( # noqa: C901 # type: ignore\n self,\n inputs: List[FormattedInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n presence_penalty: float = 0.0,\n frequency_penalty: float = 0.0,\n repetition_penalty: float = 1.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n top_k: int = -1,\n min_p: float = 0.0,\n logprobs: Optional[PositiveInt] = None,\n stop: Optional[List[str]] = None,\n stop_token_ids: Optional[List[int]] = None,\n include_stop_str_in_output: bool = False,\n logits_processors: Optional[LogitsProcessors] = None,\n extra_sampling_params: Optional[Dict[str, Any]] = None,\n) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for each input.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n repetition_penalty: the repetition penalty to use for the generation Defaults to\n `1.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n top_k: the top-k value to use for the generation. Defaults to `0`.\n min_p: the minimum probability to use for the generation. Defaults to `0.0`.\n logprobs: number of log probabilities to return per output token. If `None`,\n then no log probability won't be returned. Defaults to `None`.\n stop: a list of strings that will be used to stop the generation when found.\n Defaults to `None`.\n stop_token_ids: a list of token ids that will be used to stop the generation\n when found. 
Defaults to `None`.\n include_stop_str_in_output: whether to include the stop string in the output.\n Defaults to `False`.\n logits_processors: a list of functions to process the logits before sampling.\n Defaults to `None`.\n extra_sampling_params: dictionary with additional arguments to be passed to\n the `SamplingParams` class from `vllm`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from vllm import SamplingParams\n\n if not logits_processors:\n logits_processors = []\n\n if extra_sampling_params is None:\n extra_sampling_params = {}\n\n structured_output = None\n\n if isinstance(inputs[0], tuple):\n # Prepare the batches for structured generation\n prepared_batches, sorted_indices = self._prepare_batches(inputs) # type: ignore\n else:\n # Simulate a batch without the structured output content\n prepared_batches = [([self.prepare_input(input) for input in inputs], None)] # type: ignore\n sorted_indices = None\n\n # Case in which we have a single structured output for the dataset\n if self._structured_output_logits_processor:\n logits_processors.append(self._structured_output_logits_processor)\n\n batched_outputs: List[\"LLMOutput\"] = []\n generations = []\n\n for prepared_inputs, structured_output in prepared_batches:\n if self.structured_output is not None and structured_output is not None:\n # TODO: warning\n pass\n\n if structured_output is not None:\n logits_processors.append(\n self._prepare_structured_output(structured_output) # type: ignore\n )\n\n sampling_params = SamplingParams( # type: ignore\n n=num_generations,\n presence_penalty=presence_penalty,\n frequency_penalty=frequency_penalty,\n repetition_penalty=repetition_penalty,\n temperature=temperature,\n top_p=top_p,\n top_k=top_k,\n min_p=min_p,\n max_tokens=max_new_tokens,\n logprobs=logprobs,\n stop=stop,\n stop_token_ids=stop_token_ids,\n include_stop_str_in_output=include_stop_str_in_output,\n logits_processors=logits_processors,\n **extra_sampling_params,\n )\n\n batch_outputs: List[\"RequestOutput\"] = self._model.generate(\n prompts=prepared_inputs,\n sampling_params=sampling_params,\n use_tqdm=False,\n )\n\n # Remove structured output logit processor to avoid stacking structured output\n # logits processors that leads to non-sense generations\n if structured_output is not None:\n logits_processors.pop(-1)\n\n for input, outputs in zip(prepared_inputs, batch_outputs):\n texts, statistics, outputs_logprobs = self._process_outputs(\n input, outputs\n )\n batched_outputs.append(texts)\n generations.append(\n prepare_output(\n generations=texts,\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=outputs_logprobs,\n )\n )\n\n if sorted_indices is not None:\n pairs = list(enumerate(sorted_indices))\n pairs.sort(key=lambda x: x[1])\n generations = [generations[original_idx] for original_idx, _ in pairs]\n\n return generations\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output) ","text":"Creates the appropriate function to filter tokens to generate structured outputs. Parameters: Name Type Description Default structured_output OutlinesStructuredOutputType the configuration dict to prepare the structured output. required Returns: Type Description Union[Callable, None] The callable that will be used to guide the generation of the model. 
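The configuration that reaches this method is the same dictionary accepted by the `structured_output` attribute, e.g. `{"format": "json", "schema": <pydantic model or JSON schema>}`. A sketch assuming `outlines` is installed and using an illustrative model id:

```python
from pydantic import BaseModel

from distilabel.models.llms import vLLM


class User(BaseModel):
    name: str
    last_name: str
    id: int


llm = vLLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    structured_output={"format": "json", "schema": User},
)
llm.load()  # the logits processor built here is attached to every generation

output = llm.generate_outputs(
    inputs=[[{"role": "user", "content": "Create a user profile for a marathon runner."}]]
)
```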
Source code in src/distilabel/models/llms/vllm.py def _prepare_structured_output(\n self, structured_output: \"OutlinesStructuredOutputType\"\n) -> Union[Callable, None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n assert structured_output is not None, \"`structured_output` cannot be `None`\"\n\n result = prepare_guided_output(structured_output, \"vllm\", self._model)\n if (schema := result.get(\"schema\")) and self.structured_output:\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin","title":"CudaDevicePlacementMixin ","text":" Bases: BaseModel Mixin class to assign CUDA devices to the LLM based on the cuda_devices attribute and the device placement information provided in _device_llm_placement_map . Providing the device placement information is optional, but if it is provided, it will be used to assign CUDA devices to the LLM s, trying to avoid using the same device for different LLM s. Attributes: Name Type Description cuda_devices RuntimeParameter[Union[List[int], Literal['auto']]] a list with the ID of the CUDA devices to be used by the LLM . If set to \"auto\", the devices will be automatically assigned based on the device placement information provided in _device_llm_placement_map . If set to a list of devices, it will be checked if the devices are available to be used by the LLM . If not, a warning will be logged. disable_cuda_device_placement RuntimeParameter[bool] Whether to disable the CUDA device placement logic or not. Defaults to False . _llm_identifier Union[str, None] the identifier of the LLM to be used as key in _device_llm_placement_map . _device_llm_placement_map Generator[Dict[str, List[int]], None, None] a dictionary with the device placement information for each LLM . Source code in src/distilabel/models/mixins/cuda_device_placement.py class CudaDevicePlacementMixin(BaseModel):\n \"\"\"Mixin class to assign CUDA devices to the `LLM` based on the `cuda_devices` attribute\n and the device placement information provided in `_device_llm_placement_map`. Providing\n the device placement information is optional, but if it is provided, it will be used to\n assign CUDA devices to the `LLM`s, trying to avoid using the same device for different\n `LLM`s.\n\n Attributes:\n cuda_devices: a list with the ID of the CUDA devices to be used by the `LLM`. If set\n to \"auto\", the devices will be automatically assigned based on the device\n placement information provided in `_device_llm_placement_map`. If set to a list\n of devices, it will be checked if the devices are available to be used by the\n `LLM`. If not, a warning will be logged.\n disable_cuda_device_placement: Whether to disable the CUDA device placement logic\n or not. 
Defaults to `False`.\n _llm_identifier: the identifier of the `LLM` to be used as key in `_device_llm_placement_map`.\n _device_llm_placement_map: a dictionary with the device placement information for each\n `LLM`.\n \"\"\"\n\n cuda_devices: RuntimeParameter[Union[List[int], Literal[\"auto\"]]] = Field(\n default=\"auto\", description=\"A list with the ID of the CUDA devices to be used.\"\n )\n disable_cuda_device_placement: RuntimeParameter[bool] = Field(\n default=False,\n description=\"Whether to disable the CUDA device placement logic or not.\",\n )\n\n _llm_identifier: Union[str, None] = PrivateAttr(default=None)\n _desired_num_gpus: PositiveInt = PrivateAttr(default=1)\n _available_cuda_devices: List[int] = PrivateAttr(default_factory=list)\n _can_check_cuda_devices: bool = PrivateAttr(default=False)\n\n _logger: \"Logger\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Assign CUDA devices to the LLM based on the device placement information provided\n in `_device_llm_placement_map`.\"\"\"\n\n if self.disable_cuda_device_placement:\n return\n\n try:\n import pynvml\n\n pynvml.nvmlInit()\n device_count = pynvml.nvmlDeviceGetCount()\n self._available_cuda_devices = list(range(device_count))\n self._can_check_cuda_devices = True\n except ImportError as ie:\n if self.cuda_devices == \"auto\":\n raise ImportError(\n \"The 'pynvml' library is not installed. It is required to automatically\"\n \" assign CUDA devices to the `LLM`s. Please, install it and try again.\"\n ) from ie\n\n if self.cuda_devices:\n self._logger.warning( # type: ignore\n \"The 'pynvml' library is not installed. It is recommended to install it\"\n \" to check if the CUDA devices assigned to the LLM are available.\"\n )\n\n self._assign_cuda_devices()\n\n def unload(self) -> None:\n \"\"\"Unloads the LLM and removes the CUDA devices assigned to it from the device\n placement information provided in `_device_llm_placement_map`.\"\"\"\n if self.disable_cuda_device_placement:\n return\n\n with self._device_llm_placement_map() as device_map:\n if self._llm_identifier in device_map:\n self._logger.debug( # type: ignore\n f\"Removing '{self._llm_identifier}' from the CUDA device map file\"\n f\" '{_CUDA_DEVICE_PLACEMENT_MIXIN_FILE}'.\"\n )\n del device_map[self._llm_identifier]\n\n @contextmanager\n def _device_llm_placement_map(self) -> Generator[Dict[str, List[int]], None, None]:\n \"\"\"Reads the content of the device placement file of the node with a lock, yields\n the content, and writes the content back to the file after the context manager is\n closed. If the file doesn't exist, an empty dictionary will be yielded.\n\n Yields:\n The content of the device placement file.\n \"\"\"\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.parent.mkdir(parents=True, exist_ok=True)\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.touch()\n with portalocker.Lock(\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE,\n \"r+\",\n flags=portalocker.LockFlags.EXCLUSIVE,\n ) as f:\n try:\n content = json.load(f)\n except json.JSONDecodeError:\n content = {}\n yield content\n f.seek(0)\n f.truncate()\n f.write(json.dumps(content))\n\n def _assign_cuda_devices(self) -> None:\n \"\"\"Assigns CUDA devices to the LLM based on the device placement information provided\n in `_device_llm_placement_map`. If the `cuda_devices` attribute is set to \"auto\", it\n will be set to the first available CUDA device that is not going to be used by any\n other LLM. 
If the `cuda_devices` attribute is set to a list of devices, it will be\n checked if the devices are available to be used by the LLM. If not, a warning will be\n logged.\"\"\"\n\n # Take the lock and read the device placement information for each LLM.\n with self._device_llm_placement_map() as device_map:\n if self.cuda_devices == \"auto\":\n self.cuda_devices = []\n for _ in range(self._desired_num_gpus):\n if (device_id := self._get_cuda_device(device_map)) is not None:\n self.cuda_devices.append(device_id)\n device_map[self._llm_identifier] = self.cuda_devices # type: ignore\n if len(self.cuda_devices) != self._desired_num_gpus:\n self._logger.warning( # type: ignore\n f\"Could not assign the desired number of GPUs {self._desired_num_gpus}\"\n f\" for LLM with identifier '{self._llm_identifier}'.\"\n )\n else:\n self._check_cuda_devices(device_map)\n\n device_map[self._llm_identifier] = self.cuda_devices # type: ignore\n\n # `_device_llm_placement_map` was not provided and user didn't set the `cuda_devices`\n # attribute. In this case, the `cuda_devices` attribute will be set to an empty list.\n if self.cuda_devices == \"auto\":\n self.cuda_devices = []\n\n self._set_cuda_visible_devices()\n\n def _check_cuda_devices(self, device_map: Dict[str, List[int]]) -> None:\n \"\"\"Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs.\n\n Args:\n device_map: a dictionary with the device placement information for each LLM.\n \"\"\"\n for device in self.cuda_devices: # type: ignore\n for llm, devices in device_map.items():\n if device in devices:\n self._logger.warning( # type: ignore\n f\"LLM with identifier '{llm}' is also going to use CUDA device \"\n f\"'{device}'. This may lead to performance issues or running out\"\n \" of memory depending on the device capabilities and the loaded\"\n \" models.\"\n )\n\n def _get_cuda_device(self, device_map: Dict[str, List[int]]) -> Union[int, None]:\n \"\"\"Returns the first available CUDA device to be used by the LLM that is not going\n to be used by any other LLM.\n\n Args:\n device_map: a dictionary with the device placement information for each LLM.\n\n Returns:\n The first available CUDA device to be used by the LLM.\n\n Raises:\n RuntimeError: if there is no available CUDA device to be used by the LLM.\n \"\"\"\n for device in self._available_cuda_devices:\n if all(device not in devices for devices in device_map.values()):\n return device\n\n return None\n\n def _set_cuda_visible_devices(self) -> None:\n \"\"\"Sets the `CUDA_VISIBLE_DEVICES` environment variable to the list of CUDA devices\n to be used by the LLM.\n \"\"\"\n if not self.cuda_devices:\n return\n\n if self._can_check_cuda_devices and not all(\n device in self._available_cuda_devices for device in self.cuda_devices\n ):\n raise RuntimeError(\n f\"Invalid CUDA devices for LLM '{self._llm_identifier}': {self.cuda_devices}.\"\n f\" The available devices are: {self._available_cuda_devices}. 
Please, review\"\n \" the 'cuda_devices' attribute and try again.\"\n )\n\n cuda_devices = \",\".join([str(device) for device in self.cuda_devices])\n self._logger.info( # type: ignore\n f\"\ud83c\udfae LLM '{self._llm_identifier}' is going to use the following CUDA devices:\"\n f\" {self.cuda_devices}.\"\n )\n os.environ[\"CUDA_VISIBLE_DEVICES\"] = cuda_devices\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin.load","title":"load() ","text":"Assign CUDA devices to the LLM based on the device placement information provided in _device_llm_placement_map . Source code in src/distilabel/models/mixins/cuda_device_placement.py def load(self) -> None:\n \"\"\"Assign CUDA devices to the LLM based on the device placement information provided\n in `_device_llm_placement_map`.\"\"\"\n\n if self.disable_cuda_device_placement:\n return\n\n try:\n import pynvml\n\n pynvml.nvmlInit()\n device_count = pynvml.nvmlDeviceGetCount()\n self._available_cuda_devices = list(range(device_count))\n self._can_check_cuda_devices = True\n except ImportError as ie:\n if self.cuda_devices == \"auto\":\n raise ImportError(\n \"The 'pynvml' library is not installed. It is required to automatically\"\n \" assign CUDA devices to the `LLM`s. Please, install it and try again.\"\n ) from ie\n\n if self.cuda_devices:\n self._logger.warning( # type: ignore\n \"The 'pynvml' library is not installed. It is recommended to install it\"\n \" to check if the CUDA devices assigned to the LLM are available.\"\n )\n\n self._assign_cuda_devices()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin.unload","title":"unload() ","text":"Unloads the LLM and removes the CUDA devices assigned to it from the device placement information provided in _device_llm_placement_map . Source code in src/distilabel/models/mixins/cuda_device_placement.py def unload(self) -> None:\n \"\"\"Unloads the LLM and removes the CUDA devices assigned to it from the device\n placement information provided in `_device_llm_placement_map`.\"\"\"\n if self.disable_cuda_device_placement:\n return\n\n with self._device_llm_placement_map() as device_map:\n if self._llm_identifier in device_map:\n self._logger.debug( # type: ignore\n f\"Removing '{self._llm_identifier}' from the CUDA device map file\"\n f\" '{_CUDA_DEVICE_PLACEMENT_MIXIN_FILE}'.\"\n )\n del device_map[self._llm_identifier]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._device_llm_placement_map","title":"_device_llm_placement_map() ","text":"Reads the content of the device placement file of the node with a lock, yields the content, and writes the content back to the file after the context manager is closed. If the file doesn't exist, an empty dictionary will be yielded. Yields: Type Description Dict[str, List[int]] The content of the device placement file. Source code in src/distilabel/models/mixins/cuda_device_placement.py @contextmanager\ndef _device_llm_placement_map(self) -> Generator[Dict[str, List[int]], None, None]:\n \"\"\"Reads the content of the device placement file of the node with a lock, yields\n the content, and writes the content back to the file after the context manager is\n closed. 
If the file doesn't exist, an empty dictionary will be yielded.\n\n Yields:\n The content of the device placement file.\n \"\"\"\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.parent.mkdir(parents=True, exist_ok=True)\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.touch()\n with portalocker.Lock(\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE,\n \"r+\",\n flags=portalocker.LockFlags.EXCLUSIVE,\n ) as f:\n try:\n content = json.load(f)\n except json.JSONDecodeError:\n content = {}\n yield content\n f.seek(0)\n f.truncate()\n f.write(json.dumps(content))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._assign_cuda_devices","title":"_assign_cuda_devices() ","text":"Assigns CUDA devices to the LLM based on the device placement information provided in _device_llm_placement_map . If the cuda_devices attribute is set to \"auto\", it will be set to the first available CUDA device that is not going to be used by any other LLM. If the cuda_devices attribute is set to a list of devices, it will be checked if the devices are available to be used by the LLM. If not, a warning will be logged. Source code in src/distilabel/models/mixins/cuda_device_placement.py def _assign_cuda_devices(self) -> None:\n \"\"\"Assigns CUDA devices to the LLM based on the device placement information provided\n in `_device_llm_placement_map`. If the `cuda_devices` attribute is set to \"auto\", it\n will be set to the first available CUDA device that is not going to be used by any\n other LLM. If the `cuda_devices` attribute is set to a list of devices, it will be\n checked if the devices are available to be used by the LLM. If not, a warning will be\n logged.\"\"\"\n\n # Take the lock and read the device placement information for each LLM.\n with self._device_llm_placement_map() as device_map:\n if self.cuda_devices == \"auto\":\n self.cuda_devices = []\n for _ in range(self._desired_num_gpus):\n if (device_id := self._get_cuda_device(device_map)) is not None:\n self.cuda_devices.append(device_id)\n device_map[self._llm_identifier] = self.cuda_devices # type: ignore\n if len(self.cuda_devices) != self._desired_num_gpus:\n self._logger.warning( # type: ignore\n f\"Could not assign the desired number of GPUs {self._desired_num_gpus}\"\n f\" for LLM with identifier '{self._llm_identifier}'.\"\n )\n else:\n self._check_cuda_devices(device_map)\n\n device_map[self._llm_identifier] = self.cuda_devices # type: ignore\n\n # `_device_llm_placement_map` was not provided and user didn't set the `cuda_devices`\n # attribute. In this case, the `cuda_devices` attribute will be set to an empty list.\n if self.cuda_devices == \"auto\":\n self.cuda_devices = []\n\n self._set_cuda_visible_devices()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._check_cuda_devices","title":"_check_cuda_devices(device_map) ","text":"Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs. Parameters: Name Type Description Default device_map Dict[str, List[int]] a dictionary with the device placement information for each LLM. 
required Source code in src/distilabel/models/mixins/cuda_device_placement.py def _check_cuda_devices(self, device_map: Dict[str, List[int]]) -> None:\n \"\"\"Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs.\n\n Args:\n device_map: a dictionary with the device placement information for each LLM.\n \"\"\"\n for device in self.cuda_devices: # type: ignore\n for llm, devices in device_map.items():\n if device in devices:\n self._logger.warning( # type: ignore\n f\"LLM with identifier '{llm}' is also going to use CUDA device \"\n f\"'{device}'. This may lead to performance issues or running out\"\n \" of memory depending on the device capabilities and the loaded\"\n \" models.\"\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._get_cuda_device","title":"_get_cuda_device(device_map) ","text":"Returns the first available CUDA device to be used by the LLM that is not going to be used by any other LLM. Parameters: Name Type Description Default device_map Dict[str, List[int]] a dictionary with the device placement information for each LLM. required Returns: Type Description Union[int, None] The first available CUDA device to be used by the LLM. Raises: Type Description RuntimeError if there is no available CUDA device to be used by the LLM. Source code in src/distilabel/models/mixins/cuda_device_placement.py def _get_cuda_device(self, device_map: Dict[str, List[int]]) -> Union[int, None]:\n \"\"\"Returns the first available CUDA device to be used by the LLM that is not going\n to be used by any other LLM.\n\n Args:\n device_map: a dictionary with the device placement information for each LLM.\n\n Returns:\n The first available CUDA device to be used by the LLM.\n\n Raises:\n RuntimeError: if there is no available CUDA device to be used by the LLM.\n \"\"\"\n for device in self._available_cuda_devices:\n if all(device not in devices for devices in device_map.values()):\n return device\n\n return None\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._set_cuda_visible_devices","title":"_set_cuda_visible_devices() ","text":"Sets the CUDA_VISIBLE_DEVICES environment variable to the list of CUDA devices to be used by the LLM. Source code in src/distilabel/models/mixins/cuda_device_placement.py def _set_cuda_visible_devices(self) -> None:\n \"\"\"Sets the `CUDA_VISIBLE_DEVICES` environment variable to the list of CUDA devices\n to be used by the LLM.\n \"\"\"\n if not self.cuda_devices:\n return\n\n if self._can_check_cuda_devices and not all(\n device in self._available_cuda_devices for device in self.cuda_devices\n ):\n raise RuntimeError(\n f\"Invalid CUDA devices for LLM '{self._llm_identifier}': {self.cuda_devices}.\"\n f\" The available devices are: {self._available_cuda_devices}. Please, review\"\n \" the 'cuda_devices' attribute and try again.\"\n )\n\n cuda_devices = \",\".join([str(device) for device in self.cuda_devices])\n self._logger.info( # type: ignore\n f\"\ud83c\udfae LLM '{self._llm_identifier}' is going to use the following CUDA devices:\"\n f\" {self.cuda_devices}.\"\n )\n os.environ[\"CUDA_VISIBLE_DEVICES\"] = cuda_devices\n "},{"location":"api/pipeline/","title":"Pipeline","text":"This section contains the API reference for the distilabel pipelines. For an example on how to use the pipelines, see the Tutorial - Pipeline. 
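Before the class reference below, a minimal sketch of how a pipeline is usually defined and run; the step data and the model id are illustrative assumptions, not part of this reference page.

```python
# Minimal sketch, assuming the usual distilabel building blocks; the instruction data
# and the OpenAI model id are placeholders for illustration only.
from distilabel.models.llms import OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration

with Pipeline(name="demo-pipeline") as pipeline:
    # Root step that feeds dictionaries into the pipeline as batches.
    loader = LoadDataFromDicts(data=[{"instruction": "Write a haiku about data."}])
    # Task step that asks the LLM to generate a response for each instruction.
    generation = TextGeneration(llm=OpenAILLM(model="gpt-4o-mini"))
    # Connect the steps to build the DAG.
    loader >> generation

if __name__ == "__main__":
    distiset = pipeline.run(use_cache=False)
```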
"},{"location":"api/pipeline/#distilabel.pipeline.base","title":"base ","text":""},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline","title":"BasePipeline ","text":" Bases: ABC , RequirementsMixin , _Serializable Base class for a distilabel pipeline. Attributes: Name Type Description name The name of the pipeline. description A description of the pipeline. dag The DAG instance that represents the pipeline. _cache_dir The directory where the pipeline will be cached. _logger The logger instance that will be used by the pipeline. _batch_manager Optional[_BatchManager] The batch manager that will manage the batches received from the steps while running the pipeline. It will be created when the pipeline is run, from scratch or from cache. Defaults to None . _write_buffer Optional[_WriteBuffer] The buffer that will store the data of the leaf steps of the pipeline while running, so the Distiset can be created at the end. It will be created when the pipeline is run. Defaults to None . _fs Optional[AbstractFileSystem] The fsspec filesystem to be used to store the data of the _Batch es passed between the steps. It will be set when the pipeline is run. Defaults to None . _storage_base_path Optional[str] The base path where the data of the _Batch es passed between the steps will be stored. It will be set then the pipeline is run. Defaults to None . _use_fs_to_pass_data bool Whether to use the file system to pass the data of the _Batch es between the steps. Even if this parameter is False , the Batch es received by GlobalStep s will always use the file system to pass the data. Defaults to False . _dry_run A flag to indicate if the pipeline is running in dry run mode. Defaults to False . output_queue A queue to store the output of the steps while running the pipeline. load_queue A queue used by each Step to notify the main process it has finished loading or it the step has been unloaded. Source code in src/distilabel/pipeline/base.py class BasePipeline(ABC, RequirementsMixin, _Serializable):\n \"\"\"Base class for a `distilabel` pipeline.\n\n Attributes:\n name: The name of the pipeline.\n description: A description of the pipeline.\n dag: The `DAG` instance that represents the pipeline.\n _cache_dir: The directory where the pipeline will be cached.\n _logger: The logger instance that will be used by the pipeline.\n _batch_manager: The batch manager that will manage the batches received from the\n steps while running the pipeline. It will be created when the pipeline is run,\n from scratch or from cache. Defaults to `None`.\n _write_buffer: The buffer that will store the data of the leaf steps of the pipeline\n while running, so the `Distiset` can be created at the end. It will be created\n when the pipeline is run. Defaults to `None`.\n _fs: The `fsspec` filesystem to be used to store the data of the `_Batch`es passed\n between the steps. It will be set when the pipeline is run. Defaults to `None`.\n _storage_base_path: The base path where the data of the `_Batch`es passed between\n the steps will be stored. It will be set then the pipeline is run. Defaults\n to `None`.\n _use_fs_to_pass_data: Whether to use the file system to pass the data of the\n `_Batch`es between the steps. Even if this parameter is `False`, the `Batch`es\n received by `GlobalStep`s will always use the file system to pass the data.\n Defaults to `False`.\n _dry_run: A flag to indicate if the pipeline is running in dry run mode. 
Defaults\n to `False`.\n output_queue: A queue to store the output of the steps while running the pipeline.\n load_queue: A queue used by each `Step` to notify the main process it has finished\n loading or that the step has been unloaded.\n \"\"\"\n\n _output_queue: \"Queue[Any]\"\n _load_queue: \"Queue[Union[StepLoadStatus, None]]\"\n\n def __init__(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n cache_dir: Optional[Union[str, \"PathLike\"]] = None,\n enable_metadata: bool = False,\n requirements: Optional[List[str]] = None,\n ) -> None:\n \"\"\"Initialize the `BasePipeline` instance.\n\n Args:\n name: The name of the pipeline. If not given, a random one will be generated by default.\n description: A description of the pipeline. Defaults to `None`.\n cache_dir: A directory where the pipeline will be cached. Defaults to `None`.\n enable_metadata: Whether to include the distilabel metadata column for the pipeline\n in the final `Distiset`. It contains metadata used by distilabel, for example\n the raw outputs of the `LLM` without processing would be here, inside `raw_output_...`\n field. Defaults to `False`.\n requirements: List of requirements that must be installed to run the pipeline.\n Defaults to `None`, but can be helpful to inform in a pipeline to be shared\n that these requirements must be installed.\n \"\"\"\n self.name = name or _PIPELINE_DEFAULT_NAME\n self.description = description\n self._enable_metadata = enable_metadata\n self.dag = DAG()\n\n if cache_dir:\n self._cache_dir = Path(cache_dir)\n elif env_cache_dir := envs.DISTILABEL_CACHE_DIR:\n self._cache_dir = Path(env_cache_dir)\n else:\n self._cache_dir = constants.PIPELINES_CACHE_DIR\n\n self._logger = logging.getLogger(\"distilabel.pipeline\")\n\n self._batch_manager: Optional[\"_BatchManager\"] = None\n self._write_buffer: Optional[\"_WriteBuffer\"] = None\n self._steps_input_queues: Dict[str, \"Queue\"] = {}\n\n self._steps_load_status: Dict[str, int] = {}\n self._steps_load_status_lock = threading.Lock()\n\n self._stop_called = False\n self._stop_called_lock = threading.Lock()\n self._stop_calls = 0\n\n self._recover_offline_batch_generate_for_step: Union[\n Tuple[str, List[List[Dict[str, Any]]]], None\n ] = None\n\n self._fs: Optional[fsspec.AbstractFileSystem] = None\n self._storage_base_path: Optional[str] = None\n self._use_fs_to_pass_data: bool = False\n self._dry_run = False\n\n self._current_stage = 0\n self._stages_last_batch: List[List[str]] = []\n self._load_groups = []\n\n self.requirements = requirements or []\n\n self._exception: Union[Exception, None] = None\n\n self._log_queue: Union[\"Queue[Any]\", None] = None\n\n def __enter__(self) -> Self:\n \"\"\"Set the global pipeline instance when entering a pipeline context.\"\"\"\n _GlobalPipelineManager.set_pipeline(self)\n return self\n\n def __exit__(self, exc_type, exc_value, traceback) -> None:\n \"\"\"Unset the global pipeline instance when exiting a pipeline context.\"\"\"\n _GlobalPipelineManager.set_pipeline(None)\n self._set_pipeline_name()\n\n def _set_pipeline_name(self) -> None:\n \"\"\"Creates a name for the pipeline if it's the default one (if it hasn't been set).\"\"\"\n if self.name == _PIPELINE_DEFAULT_NAME:\n self.name = f\"pipeline_{'_'.join(self.dag)}\"\n\n @property\n def signature(self) -> str:\n \"\"\"Makes a signature (hash) of a pipeline, using the step ids and the adjacency between them.\n\n The main use is to find the pipeline in the cache folder.\n\n Returns:\n Signature of the pipeline.\n \"\"\"\n\n 
pipeline_dump = self.dump()[\"pipeline\"]\n steps_names = list(self.dag)\n connections_info = [\n f\"{c['from']}-{'-'.join(c['to'])}\" for c in pipeline_dump[\"connections\"]\n ]\n\n routing_batch_functions_info = []\n for function in pipeline_dump[\"routing_batch_functions\"]:\n step = function[\"step\"]\n routing_batch_function: \"RoutingBatchFunction\" = self.dag.get_step(step)[\n constants.ROUTING_BATCH_FUNCTION_ATTR_NAME\n ]\n if type_info := routing_batch_function._get_type_info():\n step += f\"-{type_info}\"\n routing_batch_functions_info.append(step)\n\n return hashlib.sha1(\n \",\".join(\n steps_names + connections_info + routing_batch_functions_info\n ).encode()\n ).hexdigest()\n\n def run(\n self,\n parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n load_groups: Optional[\"LoadGroups\"] = None,\n use_cache: bool = True,\n storage_parameters: Optional[Dict[str, Any]] = None,\n use_fs_to_pass_data: bool = False,\n dataset: Optional[\"InputDataset\"] = None,\n dataset_batch_size: int = 50,\n logging_handlers: Optional[List[logging.Handler]] = None,\n ) -> \"Distiset\": # type: ignore\n \"\"\"Run the pipeline. It will set the runtime parameters for the steps and validate\n the pipeline.\n\n This method should be extended by the specific pipeline implementation,\n adding the logic to run the pipeline.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n load_groups: A list containing lists of steps that have to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n This argument also allows passing the following modes:\n\n - \"sequential_step_execution\": each step will be executed in a stage i.e.\n the execution of the steps will be sequential.\n\n Defaults to `None`.\n use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n `True`.\n storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n that will be used to store the data of the `_Batch`es passed between the\n steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n `GlobalStep` it will be always used). It must have at least the \"path\" key,\n and it can contain additional keys depending on the protocol. By default,\n it will use the local file system and a directory in the cache directory.\n Defaults to `None`.\n use_fs_to_pass_data: Whether to use the file system to pass the data of\n the `_Batch`es between the steps. Even if this parameter is `False`, the\n `Batch`es received by `GlobalStep`s will always use the file system to\n pass the data. Defaults to `False`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n dataset_batch_size: if `dataset` is given, this will be the size of the batches\n yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n logging_handlers: A list of logging handlers that will be used to log the\n output of the pipeline. This argument can be useful so the logging messages\n can be extracted and used in a different context. 
Defaults to `None`.\n\n Returns:\n The `Distiset` created by the pipeline.\n \"\"\"\n\n self._exception: Union[Exception, None] = None\n\n # Set the runtime parameters that will be used during the pipeline execution.\n # They are used to generate the signature of the pipeline that is used to hit the\n # cache when the pipeline is run, so it's important to do it first.\n self._set_runtime_parameters(parameters or {})\n\n self._refresh_pipeline_from_cache()\n\n if dataset is not None:\n self._add_dataset_generator_step(dataset, dataset_batch_size)\n\n setup_logging(\n log_queue=self._log_queue,\n filename=str(self._cache_location[\"log_file\"]),\n logging_handlers=logging_handlers,\n )\n\n # Set the name of the pipeline if it's the default one. This should be called\n # if the pipeline is defined within the context manager, and the run is called\n # outside of it. Is here in the following case:\n # with Pipeline() as pipeline:\n # pipeline.run()\n self._set_pipeline_name()\n\n # Validate the pipeline DAG to check that all the steps are chainable, there are\n # no missing runtime parameters, batch sizes are correct, load groups are valid,\n # etc.\n self._load_groups = self._built_load_groups(load_groups)\n self._validate()\n\n self._set_pipeline_artifacts_path_in_steps()\n\n # Set the initial load status for all the steps\n self._init_steps_load_status()\n\n # Load the stages status or initialize it\n self._load_stages_status(use_cache)\n\n # Load the `_BatchManager` from cache or create one from scratch\n self._load_batch_manager(use_cache)\n\n # Check pipeline requirements are installed\n self._check_requirements()\n\n # Setup the filesystem that will be used to pass the data of the `_Batch`es\n self._setup_fsspec(storage_parameters)\n self._use_fs_to_pass_data = use_fs_to_pass_data\n\n if self._dry_run:\n self._logger.info(\"\ud83c\udf35 Dry run mode\")\n\n # If the batch manager is not able to generate batches, that means that the loaded\n # `_BatchManager` from cache didn't have any remaining batches to process i.e.\n # the previous pipeline execution was completed successfully.\n if not self._batch_manager.can_generate(): # type: ignore\n self._logger.info(\n \"\ud83d\udcbe Loaded batch manager from cache doesn't contain any remaining data.\"\n \" Returning `Distiset` from cache data...\"\n )\n distiset = create_distiset(\n data_dir=self._cache_location[\"data\"],\n pipeline_path=self._cache_location[\"pipeline\"],\n log_filename_path=self._cache_location[\"log_file\"],\n enable_metadata=self._enable_metadata,\n dag=self.dag,\n )\n stop_logging()\n return distiset\n\n self._setup_write_buffer(use_cache)\n\n self._print_load_stages_info()\n\n def dry_run(\n self,\n parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n batch_size: int = 1,\n dataset: Optional[\"InputDataset\"] = None,\n ) -> \"Distiset\":\n \"\"\"Do a dry run to test the pipeline runs as expected.\n\n Running a `Pipeline` in dry run mode will set all the `batch_size` of generator steps\n to the specified `batch_size`, and run just with a single batch, effectively\n running the whole pipeline with a single example. The cache will be set to `False`.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n batch_size: The batch size of the unique batch generated by the generators\n steps of the pipeline. Defaults to `1`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. 
Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n\n Returns:\n Will return the `Distiset` as the main run method would do.\n \"\"\"\n self._dry_run = True\n\n for step_name in self.dag:\n step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n\n if step.is_generator:\n if not parameters:\n parameters = {}\n parameters[step_name] = {\"batch_size\": batch_size}\n\n distiset = self.run(parameters=parameters, use_cache=False, dataset=dataset)\n\n self._dry_run = False\n return distiset\n\n def get_load_stages(self, load_groups: Optional[\"LoadGroups\"] = None) -> LoadStages:\n \"\"\"Convenient method to get the load stages of a pipeline.\n\n Args:\n load_groups: A list containing lists of steps that have to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n Defaults to `None`.\n\n Returns:\n A tuple with the first element containing a list sorted by stage containing\n lists with the names of the steps of the stage, and the second element a list\n sorted by stage containing lists with the names of the last steps of the stage.\n \"\"\"\n load_groups = self._built_load_groups(load_groups)\n return self.dag.get_steps_load_stages(load_groups)\n\n def _add_dataset_generator_step(\n self, dataset: \"InputDataset\", batch_size: int = 50\n ) -> None:\n \"\"\"Create a root step to work as the `GeneratorStep` for the pipeline using a\n dataset.\n\n Args:\n dataset: A dataset that will be used to create a `GeneratorStep` and\n placed in the DAG as the root step.\n batch_size: The size of the batches generated by the `GeneratorStep`.\n\n Raises:\n ValueError: If there's already a `GeneratorStep` in the pipeline.\n \"\"\"\n for step_name in self.dag:\n step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n if isinstance(step, GeneratorStep):\n raise DistilabelUserError(\n \"There is already a `GeneratorStep` in the pipeline, you can either\"\n \" pass a `dataset` to the run method, or create a `GeneratorStep` explicitly.\"\n f\" `GeneratorStep`: {step}\",\n page=\"sections/how_to_guides/basic/step/#types-of-steps\",\n )\n loader = make_generator_step(\n dataset=dataset,\n pipeline=self,\n batch_size=batch_size,\n )\n self.dag.add_root_step(loader)\n\n def get_runtime_parameters_info(self) -> \"PipelineRuntimeParametersInfo\":\n \"\"\"Get the runtime parameters for the steps in the pipeline.\n\n Returns:\n A dictionary with the step name as the key and a list of dictionaries with\n the parameter name and the parameter info as the value.\n \"\"\"\n runtime_parameters = {}\n for step_name in self.dag:\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n runtime_parameters[step_name] = step.get_runtime_parameters_info()\n return runtime_parameters\n\n def _built_load_groups(\n self, load_groups: Optional[\"LoadGroups\"] = None\n ) -> List[List[str]]:\n if load_groups is None:\n return []\n\n if load_groups == \"sequential_step_execution\":\n return [[step_name] for step_name in self.dag]\n\n return [\n [\n step.name if isinstance(step, _Step) else step\n for step in steps_load_group\n ] # type: ignore\n for steps_load_group in load_groups\n ]\n\n def _validate(self) -> None:\n \"\"\"Validates the pipeline DAG to check that all the steps are chainable, there are\n no missing runtime parameters, batch sizes are correct and that load groups are\n valid (if any).\"\"\"\n self.dag.validate()\n 
self._validate_load_groups(self._load_groups)\n\n def _validate_load_groups(self, load_groups: List[List[Any]]) -> None: # noqa: C901\n \"\"\"Checks that the provided load groups are valid and that the steps can be scheduled\n to be loaded in different stages without any issue.\n\n Args:\n load_groups: the load groups to be checked.\n\n Raises:\n DistilabelUserError: if something is not OK when checking the load groups.\n \"\"\"\n\n def check_predecessor_in_load_group(\n step_name: str, load_group: List[str], first: bool\n ) -> Union[str, None]:\n if not first and step_name in load_group:\n return step_name\n\n for predecessor_step_name in self.dag.get_step_predecessors(step_name):\n # Immediate predecessor is in the same load group. This is OK.\n if first and predecessor_step_name in load_group:\n continue\n\n # Case: A -> B -> C, load_group=[A, C]\n # If a non-immediate predecessor is in the same load group and an immediate\n # predecessor is not , then it's not OK because we cannot load `step_name`\n # before one immediate predecessor.\n if step_name_in_load_group := check_predecessor_in_load_group(\n predecessor_step_name, load_group, False\n ):\n return step_name_in_load_group\n\n return None\n\n steps_included_in_load_group = []\n for load_group_num, steps_load_group in enumerate(load_groups):\n for step_name in steps_load_group:\n if step_name not in self.dag.G:\n raise DistilabelUserError(\n f\"Step with name '{step_name}' included in group {load_group_num} of\"\n \" the `load_groups` is not an step included in the pipeline. Please,\"\n \" check that you're passing the correct step name and run again.\",\n page=\"sections/how_to_guides/advanced/load_groups_and_execution_stages\",\n )\n\n node = self.dag.get_step(step_name)\n step: \"_Step\" = node[constants.STEP_ATTR_NAME]\n\n if step_name_in_load_group := check_predecessor_in_load_group(\n step_name, steps_load_group, True\n ):\n # Improve this user error message\n raise DistilabelUserError(\n f\"Step with name '{step_name}' cannot be in the same load group\"\n f\" as the step with name '{step_name_in_load_group}'. '{step_name_in_load_group}'\"\n f\" is not an immediate predecessor of '{step_name}' and there are\"\n \" immediate predecessors that have not been included.\",\n page=\"sections/how_to_guides/advanced/load_groups_and_execution_stages\",\n )\n\n if step.is_global and len(steps_load_group) > 1:\n raise DistilabelUserError(\n f\"Global step '{step_name}' has been included in a load group along\"\n \" more steps. Global steps cannot be included in a load group with\"\n \" more steps as they will be loaded in a different stage to the\"\n \" rest of the steps in the pipeline by default.\",\n page=\"sections/how_to_guides/advanced/load_groups_and_execution_stages\",\n )\n\n if step_name in steps_included_in_load_group:\n raise DistilabelUserError(\n f\"Step with name '{step_name}' in load group {load_group_num} has\"\n \" already been included in a previous load group. 
A step cannot be in more\"\n \" than one load group.\",\n page=\"sections/how_to_guides/advanced/load_groups_and_execution_stages\",\n )\n\n steps_included_in_load_group.append(step_name)\n\n def _init_steps_load_status(self) -> None:\n \"\"\"Initialize the `_steps_load_status` dictionary assigning 0 to every step of\n the pipeline.\"\"\"\n for step_name in self.dag:\n self._steps_load_status[step_name] = _STEP_NOT_LOADED_CODE\n\n def _set_pipeline_artifacts_path_in_steps(self) -> None:\n \"\"\"Sets the attribute `_pipeline_artifacts_path` in all the `Step`s of the pipeline,\n so steps can use it to get the path to save the generated artifacts.\"\"\"\n artifacts_path = self._cache_location[\"data\"] / constants.STEPS_ARTIFACTS_PATH\n for name in self.dag:\n step: \"_Step\" = self.dag.get_step(name)[constants.STEP_ATTR_NAME]\n step.set_pipeline_artifacts_path(path=artifacts_path)\n\n def _check_requirements(self) -> None:\n \"\"\"Checks if the dependencies required to run the pipeline are installed.\n\n Raises:\n ModuleNotFoundError: if one or more requirements are missing.\n \"\"\"\n if to_install := self.requirements_to_install():\n # Print the list of requirements like they would appear in a requirements.txt\n to_install_list = \"\\n\" + \"\\n\".join(to_install)\n msg = f\"Please install the following requirements to run the pipeline: {to_install_list}\"\n self._logger.error(msg)\n raise ModuleNotFoundError(msg)\n\n def _setup_fsspec(\n self, storage_parameters: Optional[Dict[str, Any]] = None\n ) -> None:\n \"\"\"Setups the `fsspec` filesystem to be used to store the data of the `_Batch`es\n passed between the steps.\n\n Args:\n storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n that will be used to store the data of the `_Batch`es passed between the\n steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n `GlobalStep` it will be always used). It must have at least the \"path\" key,\n and it can contain additional keys depending on the protocol. By default,\n it will use the local file system and a directory in the cache directory.\n Defaults to `None`.\n \"\"\"\n if not storage_parameters:\n self._fs = fsspec.filesystem(\"file\")\n self._storage_base_path = (\n f\"file://{self._cache_location['batch_input_data']}\"\n )\n return\n\n if \"path\" not in storage_parameters:\n raise DistilabelUserError(\n \"The 'path' key must be present in the `storage_parameters` dictionary\"\n \" if it's not `None`.\",\n page=\"sections/how_to_guides/advanced/fs_to_pass_data/\",\n )\n\n path = storage_parameters.pop(\"path\")\n protocol = UPath(path).protocol\n\n self._fs = fsspec.filesystem(protocol, **storage_parameters)\n self._storage_base_path = path\n\n def _add_step(self, step: \"_Step\") -> None:\n \"\"\"Add a step to the pipeline.\n\n Args:\n step: The step to be added to the pipeline.\n \"\"\"\n self.dag.add_step(step)\n\n def _add_edge(self, from_step: str, to_step: str) -> None:\n \"\"\"Add an edge between two steps in the pipeline.\n\n Args:\n from_step: The name of the step that will generate the input for `to_step`.\n to_step: The name of the step that will receive the input from `from_step`.\n \"\"\"\n self.dag.add_edge(from_step, to_step)\n\n # Check if `from_step` has a `routing_batch_function`. 
If it does, then mark\n # `to_step` as a step that will receive a routed batch.\n node = self.dag.get_step(from_step) # type: ignore\n routing_batch_function = node.get(\n constants.ROUTING_BATCH_FUNCTION_ATTR_NAME, None\n )\n self.dag.set_step_attr(\n name=to_step,\n attr=constants.RECEIVES_ROUTED_BATCHES_ATTR_NAME,\n value=routing_batch_function is not None,\n )\n\n def _is_convergence_step(self, step_name: str) -> None:\n \"\"\"Checks if a step is a convergence step.\n\n Args:\n step_name: The name of the step.\n \"\"\"\n return self.dag.get_step(step_name).get(constants.CONVERGENCE_STEP_ATTR_NAME)\n\n def _add_routing_batch_function(\n self, step_name: str, routing_batch_function: \"RoutingBatchFunction\"\n ) -> None:\n \"\"\"Add a routing batch function to a step.\n\n Args:\n step_name: The name of the step that will receive the routed batch.\n routing_batch_function: The function that will route the batch to the step.\n \"\"\"\n self.dag.set_step_attr(\n name=step_name,\n attr=constants.ROUTING_BATCH_FUNCTION_ATTR_NAME,\n value=routing_batch_function,\n )\n\n def _set_runtime_parameters(self, parameters: Dict[str, Dict[str, Any]]) -> None:\n \"\"\"Set the runtime parameters for the steps in the pipeline.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the parameter name as the key and the parameter value as the value.\n \"\"\"\n step_names = set(self.dag.G)\n for step_name, step_parameters in parameters.items():\n if step_name not in step_names:\n self._logger.warning(\n f\"\u2753 Step '{step_name}' provided in `Pipeline.run(parameters={{...}})` not found in the pipeline.\"\n f\" Available steps are: {step_names}.\"\n )\n else:\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n step.set_runtime_parameters(step_parameters)\n\n def _model_dump(self, obj: Any, **kwargs: Any) -> Dict[str, Any]:\n \"\"\"Dumps the DAG content to a dict.\n\n Args:\n obj (Any): Unused, just kept to match the signature of the parent method.\n kwargs (Any): Unused, just kept to match the signature of the parent method.\n\n Returns:\n Dict[str, Any]: Internal representation of the DAG from networkx in a serializable format.\n \"\"\"\n return self.dag.dump()\n\n def draw(\n self,\n path: Optional[Union[str, Path]] = \"pipeline.png\",\n top_to_bottom: bool = False,\n show_edge_labels: bool = True,\n ) -> str:\n \"\"\"\n Draws the pipeline.\n\n Parameters:\n path: The path to save the image to.\n top_to_bottom: Whether to draw the DAG top to bottom. Defaults to `False`.\n show_edge_labels: Whether to show the edge labels. 
Defaults to `True`.\n\n Returns:\n The path to the saved image.\n \"\"\"\n png = self.dag.draw(\n top_to_bottom=top_to_bottom, show_edge_labels=show_edge_labels\n )\n with open(path, \"wb\") as f:\n f.write(png)\n return path\n\n def __repr__(self) -> str:\n \"\"\"\n If running in a Jupyter notebook, display an image representing this `Pipeline`.\n \"\"\"\n if in_notebook():\n try:\n from IPython.display import Image, display\n\n image_data = self.dag.draw()\n\n display(Image(image_data))\n except Exception:\n pass\n return super().__repr__()\n\n def dump(self, **kwargs: Any) -> Dict[str, Any]:\n return {\n \"distilabel\": {\"version\": __version__},\n \"pipeline\": {\n \"name\": self.name,\n \"description\": self.description,\n **super().dump(),\n },\n \"requirements\": self.requirements,\n }\n\n @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> Self:\n \"\"\"Create a Pipeline from a dict containing the serialized data.\n\n Note:\n It's intended for internal use.\n\n Args:\n data (Dict[str, Any]): Dictionary containing the serialized data from a Pipeline.\n\n Returns:\n BasePipeline: Pipeline recreated from the dictionary info.\n \"\"\"\n name = data[\"pipeline\"][\"name\"]\n description = data[\"pipeline\"].get(\"description\")\n requirements = data.get(\"requirements\", [])\n with cls(name=name, description=description, requirements=requirements) as pipe:\n pipe.dag = DAG.from_dict(data[\"pipeline\"])\n return pipe\n\n @property\n def _cache_location(self) -> \"_CacheLocation\":\n \"\"\"Dictionary containing the object that will stored and the location,\n whether it is a filename or a folder.\n\n Returns:\n Path: Filenames where the pipeline content will be serialized.\n \"\"\"\n folder = self._cache_dir / self.name / self.signature\n pipeline_execution_dir = folder / \"executions\" / self.aggregated_steps_signature\n return {\n \"pipeline\": pipeline_execution_dir / \"pipeline.yaml\",\n \"batch_manager\": pipeline_execution_dir / \"batch_manager.json\",\n \"steps_data\": self._cache_dir / self.name / \"steps_data\",\n \"data\": pipeline_execution_dir / \"data\",\n \"batch_input_data\": pipeline_execution_dir / \"batch_input_data\",\n \"log_file\": pipeline_execution_dir / \"pipeline.log\",\n \"stages_file\": pipeline_execution_dir / \"stages.json\",\n }\n\n @property\n def aggregated_steps_signature(self) -> str:\n \"\"\"Creates an aggregated signature using `Step`s signature that will be used for\n the `_BatchManager`.\n\n Returns:\n The aggregated signature.\n \"\"\"\n signatures = []\n for step_name in self.dag:\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n signatures.append(step.signature)\n return hashlib.sha1(\"\".join(signatures).encode()).hexdigest()\n\n def _cache(self) -> None:\n \"\"\"Saves the `BasePipeline` using the `_cache_filename`.\"\"\"\n if self._dry_run:\n return\n\n self.save(\n path=self._cache_location[\"pipeline\"],\n format=self._cache_location[\"pipeline\"].suffix.replace(\".\", \"\"), # type: ignore\n )\n\n if self._batch_manager is not None:\n self._batch_manager.cache(\n path=self._cache_location[\"batch_manager\"],\n steps_data_path=self._cache_location[\"steps_data\"],\n )\n\n self._save_stages_status()\n\n self._logger.debug(\"Pipeline and batch manager saved to cache.\")\n\n def _save_stages_status(self) -> None:\n \"\"\"Saves the stages status to cache.\"\"\"\n self.save(\n path=self._cache_location[\"stages_file\"],\n format=\"json\",\n dump={\n \"current_stage\": self._current_stage,\n \"stages_last_batch\": 
self._stages_last_batch,\n },\n )\n\n def _get_steps_load_stages(self) -> Tuple[List[List[str]], List[List[str]]]:\n return self.dag.get_steps_load_stages(self._load_groups)\n\n def _load_stages_status(self, use_cache: bool = True) -> None:\n \"\"\"Try to load the stages status from cache, or initialize it if cache file doesn't\n exist or cache is not going to be used.\"\"\"\n if use_cache and self._cache_location[\"stages_file\"].exists():\n stages_status = read_json(self._cache_location[\"stages_file\"])\n self._current_stage = stages_status[\"current_stage\"]\n self._stages_last_batch = stages_status[\"stages_last_batch\"]\n else:\n self._current_stage = 0\n self._stages_last_batch = [\n [] for _ in range(len(self._get_steps_load_stages()[0]))\n ]\n\n def _refresh_pipeline_from_cache(self) -> None:\n \"\"\"Refresh the DAG (and its steps) from the cache file. This is useful as some\n `Step`s can update and change their state during the pipeline execution, and this\n method will make sure the pipeline is up-to-date with the latest changes when\n the pipeline is reloaded from cache.\n \"\"\"\n\n def recursively_handle_secrets_and_excluded_attributes(\n cached_model: \"BaseModel\", model: \"BaseModel\"\n ) -> None:\n \"\"\"Recursively handle the secrets and excluded attributes of a `BaseModel`,\n setting the values of the cached model to the values of the model.\n\n Args:\n cached_model: The cached model that will be updated as it doesn't contain\n the secrets and excluded attributes (not serialized).\n model: The model that contains the secrets and excluded attributes because\n it comes from pipeline instantiation.\n \"\"\"\n for field_name, field_info in cached_model.model_fields.items():\n if field_name in (\"pipeline\"):\n continue\n\n inner_type = extract_annotation_inner_type(field_info.annotation)\n if is_type_pydantic_secret_field(inner_type) or field_info.exclude:\n setattr(cached_model, field_name, getattr(model, field_name))\n elif isclass(inner_type) and issubclass(inner_type, BaseModel):\n recursively_handle_secrets_and_excluded_attributes(\n getattr(cached_model, field_name),\n getattr(model, field_name),\n )\n\n if self._cache_location[\"pipeline\"].exists():\n cached_dag = self.from_yaml(self._cache_location[\"pipeline\"]).dag\n\n for step_name in cached_dag:\n step_cached: \"_Step\" = cached_dag.get_step(step_name)[\n constants.STEP_ATTR_NAME\n ]\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n recursively_handle_secrets_and_excluded_attributes(step_cached, step)\n\n self.dag = cached_dag\n\n def _load_batch_manager(self, use_cache: bool = True) -> None:\n \"\"\"Will try to load the `_BatchManager` from the cache dir if found. 
Otherwise,\n it will create one from scratch.\n\n If the `_BatchManager` is loaded from cache, we check for invalid steps (those that\n may have a different signature than the original in the pipeline folder), and\n restart them, as well as their successors.\n\n Args:\n use_cache: whether the cache should be used or not.\n \"\"\"\n batch_manager_cache_loc = self._cache_location[\"batch_manager\"]\n\n # This first condition handles the case in which the pipeline is exactly the same\n # no steps have been added, removed or changed.\n if use_cache and batch_manager_cache_loc.exists():\n self._logger.info(\n f\"\ud83d\udcbe Loading `_BatchManager` from cache: '{batch_manager_cache_loc}'\"\n )\n self._batch_manager = _BatchManager.load_from_cache(\n dag=self.dag,\n batch_manager_path=batch_manager_cache_loc,\n steps_data_path=self._cache_location[\"steps_data\"],\n )\n self._invalidate_steps_cache_if_required()\n # In this other case, the pipeline has been changed. We need to create a new batch\n # manager and if `use_cache==True` then check which outputs have we computed and\n # cached for steps that haven't changed but that were executed in another pipeline,\n # and therefore we can reuse\n else:\n self._batch_manager = _BatchManager.from_dag(\n dag=self.dag,\n use_cache=use_cache,\n steps_data_path=self._cache_location[\"steps_data\"],\n )\n\n def _invalidate_steps_cache_if_required(self) -> None:\n \"\"\"Iterates over the steps of the pipeline and invalidates their cache if required.\"\"\"\n for step_name in self.dag:\n # `GeneratorStep`s doesn't receive input data so no need to check their\n # `_BatchManagerStep`\n if self.dag.get_step(step_name)[constants.STEP_ATTR_NAME].is_generator:\n continue\n\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n if not step.use_cache:\n self._batch_manager.invalidate_cache_for(\n step_name=step.name,\n dag=self.dag,\n steps_data_path=self._cache_location[\"steps_data\"],\n ) # type: ignore\n self._logger.info(\n f\"\u267b\ufe0f Step '{step.name}' won't use cache (`use_cache=False`). 
The cache of this step and their successors won't be\"\n \" reused and the results will have to be recomputed.\"\n )\n break\n\n def _setup_write_buffer(self, use_cache: bool = True) -> None:\n \"\"\"Setups the `_WriteBuffer` that will store the data of the leaf steps of the\n pipeline while running, so the `Distiset` can be created at the end.\n \"\"\"\n if not use_cache and self._cache_location[\"data\"].exists():\n shutil.rmtree(self._cache_location[\"data\"])\n buffer_data_path = self._cache_location[\"data\"] / constants.STEPS_OUTPUTS_PATH\n self._logger.info(f\"\ud83d\udcdd Pipeline data will be written to '{buffer_data_path}'\")\n self._write_buffer = _WriteBuffer(\n buffer_data_path,\n self.dag.leaf_steps,\n steps_cached={\n step_name: self.dag.get_step(step_name)[\n constants.STEP_ATTR_NAME\n ].use_cache\n for step_name in self.dag\n },\n )\n\n def _print_load_stages_info(self) -> None:\n \"\"\"Prints the information about the load stages.\"\"\"\n stages, _ = self._get_steps_load_stages()\n msg = \"\"\n for stage, steps in enumerate(stages):\n steps_to_be_loaded = self._steps_to_be_loaded_in_stage(stage)\n msg += f\"\\n * Stage {stage}:\"\n for step_name in steps:\n step: \"Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n if step.is_generator:\n emoji = \"\ud83d\udeb0\"\n elif step.is_global:\n emoji = \"\ud83c\udf10\"\n else:\n emoji = \"\ud83d\udd04\"\n msg += f\"\\n - {emoji} '{step_name}'\"\n if step_name not in steps_to_be_loaded:\n msg += \" (results cached, won't be loaded and executed)\"\n legend = \"\\n * Legend: \ud83d\udeb0 GeneratorStep \ud83c\udf10 GlobalStep \ud83d\udd04 Step\"\n self._logger.info(\n f\"\u231b The steps of the pipeline will be loaded in stages:{legend}{msg}\"\n )\n\n def _run_output_queue_loop_in_thread(self) -> threading.Thread:\n \"\"\"Runs the output queue loop in a separate thread to receive the output batches\n from the steps. This is done to avoid the signal handler to block the loop, which\n would prevent the pipeline from stopping correctly.\"\"\"\n thread = threading.Thread(target=self._output_queue_loop)\n thread.start()\n return thread\n\n def _output_queue_loop(self) -> None:\n \"\"\"Loop to receive the output batches from the steps and manage the flow of the\n batches through the pipeline.\"\"\"\n self._create_steps_input_queues()\n\n if not self._initialize_pipeline_execution():\n return\n\n while self._should_continue_processing(): # type: ignore\n self._logger.debug(\"Waiting for output batch from step...\")\n if (batch := self._output_queue.get()) is None:\n self._logger.debug(\"Received `None` from output queue. 
Breaking loop.\")\n break\n\n self._logger.debug(\n f\"Received batch with seq_no {batch.seq_no} from step '{batch.step_name}'\"\n f\" from output queue: {batch}\"\n )\n\n self._process_batch(batch)\n\n # If `_stop_called` was set to `True` while waiting for the output queue, then\n # we need to handle the stop of the pipeline and break the loop to avoid\n # propagating the batches through the pipeline and making the stop process\n # slower.\n with self._stop_called_lock:\n if self._stop_called:\n self._handle_batch_on_stop(batch)\n break\n\n # If there is another load stage and all the `last_batch`es from the stage\n # have been received, then load the next stage.\n if self._should_load_next_stage():\n self._wait_current_stage_to_finish()\n if not self._update_stage():\n break\n\n self._manage_batch_flow(batch)\n\n self._finalize_pipeline_execution()\n\n def _create_steps_input_queues(self) -> None:\n \"\"\"Creates the input queue for all the steps in the pipeline.\"\"\"\n for step_name in self.dag:\n self._logger.debug(f\"Creating input queue for '{step_name}' step...\")\n input_queue = self._create_step_input_queue(step_name)\n self._steps_input_queues[step_name] = input_queue\n\n def _initialize_pipeline_execution(self) -> bool:\n \"\"\"Load the steps of the required stage to initialize the pipeline execution,\n and requests the initial batches to trigger the batch flowing in the pipeline.\n\n Returns:\n `True` if initialization went OK, `False` otherwise.\n \"\"\"\n # Wait for all the steps to be loaded correctly\n if not self._run_stage_steps_and_wait(stage=self._current_stage):\n self._set_steps_not_loaded_exception()\n return False\n\n # Send the \"first\" batches to the steps so the batches starts flowing through\n # the input queues and output queue\n self._request_initial_batches()\n\n return True\n\n def _should_continue_processing(self) -> bool:\n \"\"\"Condition for the consume batches from the `output_queue` loop.\n\n Returns:\n `True` if should continue consuming batches, `False` otherwise and the pipeline\n should stop.\n \"\"\"\n with self._stop_called_lock:\n return self._batch_manager.can_generate() and not self._stop_called # type: ignore\n\n def _process_batch(\n self, batch: \"_Batch\", send_last_batch_flag: bool = True\n ) -> None:\n \"\"\"Process a batch consumed from the `output_queue`.\n\n Args:\n batch: the batch to be processed.\n \"\"\"\n if batch.data_path:\n self._logger.debug(\n f\"Reading {batch.seq_no} batch data from '{batch.step_name}': '{batch.data_path}'\"\n )\n batch.read_batch_data_from_fs()\n\n if batch.step_name in self.dag.leaf_steps:\n self._write_buffer.add_batch(batch) # type: ignore\n\n if batch.last_batch:\n self._register_stages_last_batch(batch)\n\n # Make sure to send the `LAST_BATCH_SENT_FLAG` to the predecessors of the step\n # if the batch is the last one, so they stop their processing loop even if they\n # haven't received the last batch because of the routing function.\n if send_last_batch_flag:\n for step_name in self.dag.get_step_predecessors(batch.step_name):\n if self._is_step_running(step_name):\n self._send_last_batch_flag_to_step(step_name)\n\n def _set_step_for_recovering_offline_batch_generation(\n self, step: \"_Step\", data: List[List[Dict[str, Any]]]\n ) -> None:\n \"\"\"Sets the required information to recover a pipeline execution from a `_Step`\n that used an `LLM` with offline batch generation.\n\n Args:\n step: The `_Step` that used an `LLM` with offline batch generation.\n data: The data that was used to generate the 
batches for the step.\n \"\"\"\n # Replace step so the attribute `jobs_ids` of the `LLM` is not lost, as it was\n # updated in the child process but not in the main process.\n step_name: str = step.name # type: ignore\n self.dag.set_step_attr(\n name=step_name, attr=constants.STEP_ATTR_NAME, value=step\n )\n self._recover_offline_batch_generate_for_step = (step_name, data)\n\n def _add_batch_for_recovering_offline_batch_generation(self) -> None:\n \"\"\"Adds a dummy `_Batch` to the specified step name (it's a `Task` that used an\n `LLM` with offline batch generation) to recover the pipeline state for offline\n batch generation in next pipeline executions.\"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n if self._recover_offline_batch_generate_for_step is None:\n return\n\n step_name, data = self._recover_offline_batch_generate_for_step\n self._logger.debug(\n f\"Adding batch to '{step_name}' step to recover pipeline execution for offline\"\n \" batch generation...\"\n )\n self._batch_manager.add_batch_to_recover_offline_batch_generation(\n to_step=step_name,\n data=data,\n )\n\n def _register_stages_last_batch(self, batch: \"_Batch\") -> None:\n \"\"\"Registers the last batch received from a step in the `_stages_last_batch`\n dictionary.\n\n Args:\n batch: The last batch received from a step.\n \"\"\"\n _, stages_last_steps = self._get_steps_load_stages()\n stage_last_steps = stages_last_steps[self._current_stage]\n if batch.step_name in stage_last_steps:\n self._stages_last_batch[self._current_stage].append(batch.step_name)\n self._stages_last_batch[self._current_stage].sort()\n\n def _update_stage(self) -> bool:\n \"\"\"Checks if the steps of next stage should be loaded and updates `_current_stage`\n attribute.\n\n Returns:\n `True` if updating the stage went OK, `False` otherwise.\n \"\"\"\n self._current_stage += 1\n if not self._run_stage_steps_and_wait(stage=self._current_stage):\n self._set_steps_not_loaded_exception()\n return False\n\n return True\n\n def _should_load_next_stage(self) -> bool:\n \"\"\"Returns if the next stage should be loaded.\n\n Returns:\n `True` if the next stage should be loaded, `False` otherwise.\n \"\"\"\n _, stage_last_steps = self._get_steps_load_stages()\n there_is_next_stage = self._current_stage + 1 < len(stage_last_steps)\n stage_last_batches_received = (\n self._stages_last_batch[self._current_stage]\n == stage_last_steps[self._current_stage]\n )\n return there_is_next_stage and stage_last_batches_received\n\n def _finalize_pipeline_execution(self) -> None:\n \"\"\"Finalizes the pipeline execution handling the prematurely stop of the pipeline\n if required, caching the data and ensuring that all the steps finish its execution.\"\"\"\n\n # Send `None` to steps `input_queue`s just in case some step is still waiting\n self._notify_steps_to_stop()\n\n for step_name in self.dag:\n while self._is_step_running(step_name):\n self._logger.debug(f\"Waiting for step '{step_name}' to finish...\")\n time.sleep(0.5)\n\n with self._stop_called_lock:\n if self._stop_called:\n self._handle_stop()\n\n # Reset flag state\n self._stop_called = False\n\n self._add_batch_for_recovering_offline_batch_generation()\n\n self._cache()\n\n def _run_load_queue_loop_in_thread(self) -> threading.Thread:\n \"\"\"Runs a background thread that reads from the `load_queue` to update the status\n of the number of replicas loaded for each step.\n\n Returns:\n The thread that was started.\n \"\"\"\n thread = threading.Thread(target=self._run_load_queue_loop)\n 
thread.start()\n return thread\n\n def _run_load_queue_loop(self) -> None:\n \"\"\"Runs a loop that reads from the `load_queue` to update the status of the number\n of replicas loaded for each step.\"\"\"\n\n while True:\n if (load_info := self._load_queue.get()) is None:\n self._logger.debug(\"Received `None` from load queue. Breaking loop.\")\n break\n\n with self._steps_load_status_lock:\n step_name, status = load_info[\"name\"], load_info[\"status\"]\n if status == \"loaded\":\n if self._steps_load_status[step_name] == _STEP_NOT_LOADED_CODE:\n self._steps_load_status[step_name] = 1\n else:\n self._steps_load_status[step_name] += 1\n elif status == \"unloaded\":\n self._steps_load_status[step_name] -= 1\n if self._steps_load_status[step_name] == 0:\n self._steps_load_status[step_name] = _STEP_UNLOADED_CODE\n else:\n # load failed\n self._steps_load_status[step_name] = _STEP_LOAD_FAILED_CODE\n\n self._logger.debug(\n f\"Step '{step_name}' loaded replicas: {self._steps_load_status[step_name]}\"\n )\n\n def _is_step_running(self, step_name: str) -> bool:\n \"\"\"Checks if the step is running (at least one replica is running).\n\n Args:\n step_name: The step to be check if running.\n\n Returns:\n `True` if the step is running, `False` otherwise.\n \"\"\"\n with self._steps_load_status_lock:\n return self._steps_load_status[step_name] >= 1\n\n def _steps_to_be_loaded_in_stage(self, stage: int) -> List[str]:\n \"\"\"Returns the list of steps of the provided stage that should be loaded taking\n into account if they have finished.\n\n Args:\n stage: the stage number\n\n Returns:\n A list containing the name of the steps that should be loaded in this stage.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n steps_stages, _ = self._get_steps_load_stages()\n\n return [\n step\n for step in steps_stages[stage]\n if not self._batch_manager.step_has_finished(step)\n ]\n\n def _get_steps_load_status(self, steps: List[str]) -> Dict[str, int]:\n \"\"\"Gets the a dictionary containing the load status of the provided steps.\n\n Args:\n steps: a list containing the names of the steps to get their load status.\n\n Returns:\n A dictionary containing the load status of the provided steps.\n \"\"\"\n return {\n step_name: replicas\n for step_name, replicas in self._steps_load_status.items()\n if step_name in steps\n }\n\n def _wait_current_stage_to_finish(self) -> None:\n \"\"\"Waits for the current stage to finish.\"\"\"\n stage = self._current_stage\n steps = self._steps_to_be_loaded_in_stage(stage)\n self._logger.info(f\"\u23f3 Waiting for stage {stage} to finish...\")\n with self._stop_called_lock:\n while not self._stop_called:\n filtered_steps_load_status = self._get_steps_load_status(steps)\n if all(\n replicas == _STEP_UNLOADED_CODE\n for replicas in filtered_steps_load_status.values()\n ):\n self._logger.info(f\"\u2705 Stage {stage} has finished!\")\n break\n\n def _run_stage_steps_and_wait(self, stage: int) -> bool:\n \"\"\"Runs the steps of the specified stage and waits for them to be ready.\n\n Args:\n stage: the stage from which the steps have to be loaded.\n\n Returns:\n `True` if all the steps have been loaded correctly, `False` otherwise.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n steps = self._steps_to_be_loaded_in_stage(stage)\n self._logger.debug(f\"Steps to be loaded in stage {stage}: {steps}\")\n\n # Run the steps of the stage\n self._run_steps(steps=steps)\n\n # Wait for them to be ready\n self._logger.info(f\"\u23f3 Waiting for all the 
steps of stage {stage} to load...\")\n previous_message = None\n with self._stop_called_lock:\n while not self._stop_called:\n with self._steps_load_status_lock:\n filtered_steps_load_status = self._get_steps_load_status(steps)\n self._logger.debug(\n f\"Steps from stage {stage} loaded: {filtered_steps_load_status}\"\n )\n\n if any(\n replicas_loaded == _STEP_LOAD_FAILED_CODE\n for replicas_loaded in filtered_steps_load_status.values()\n ):\n self._logger.error(\n f\"\u274c Failed to load all the steps of stage {stage}\"\n )\n return False\n\n num_steps_loaded = 0\n replicas_message = \"\"\n for step_name, replicas in filtered_steps_load_status.items():\n step_replica_count = self.dag.get_step_replica_count(step_name)\n # It can happen that the step is very fast and it has done all the\n # work and have finished its execution before checking if it has\n # been loaded, that's why we also considered the step to be loaded\n # if `_STEP_UNLOADED_CODE`.\n if (\n replicas == step_replica_count\n or replicas == _STEP_UNLOADED_CODE\n ):\n num_steps_loaded += 1\n replicas_message += f\"\\n * '{step_name}' replicas: {max(0, replicas)}/{step_replica_count}\"\n\n message = f\"\u23f3 Steps from stage {stage} loaded: {num_steps_loaded}/{len(filtered_steps_load_status)}{replicas_message}\"\n if num_steps_loaded > 0 and message != previous_message:\n self._logger.info(message)\n previous_message = message\n\n if num_steps_loaded == len(filtered_steps_load_status):\n self._logger.info(\n f\"\u2705 All the steps from stage {stage} have been loaded!\"\n )\n return True\n\n time.sleep(2.5)\n\n return not self._stop_called\n\n def _handle_stop(self) -> None:\n \"\"\"Handles the stop of the pipeline execution, which will stop the steps from\n processing more batches and wait for the output queue to be empty, to not lose\n any data that was already processed by the steps before the stop was called.\"\"\"\n self._logger.debug(\"Handling stop of the pipeline execution...\")\n\n self._add_batches_back_to_batch_manager()\n\n # Wait for the input queue to be empty, which means that all the steps finished\n # processing the batches that were sent before the stop flag.\n self._wait_steps_input_queues_empty()\n\n self._consume_output_queue()\n\n if self._should_load_next_stage():\n self._current_stage += 1\n\n def _wait_steps_input_queues_empty(self) -> None:\n self._logger.debug(\"Waiting for steps input queues to be empty...\")\n for step_name in self.dag:\n self._wait_step_input_queue_empty(step_name)\n self._logger.debug(\"Steps input queues are empty!\")\n\n def _wait_step_input_queue_empty(self, step_name: str) -> Union[\"Queue[Any]\", None]:\n \"\"\"Waits for the input queue of a step to be empty.\n\n Args:\n step_name: The name of the step.\n\n Returns:\n The input queue of the step if it's not loaded or finished, `None` otherwise.\n \"\"\"\n if self._check_step_not_loaded_or_finished(step_name):\n return None\n\n if input_queue := self.dag.get_step(step_name).get(\n constants.INPUT_QUEUE_ATTR_NAME\n ):\n while input_queue.qsize() != 0:\n pass\n return input_queue\n\n def _check_step_not_loaded_or_finished(self, step_name: str) -> bool:\n \"\"\"Checks if a step is not loaded or already finished.\n\n Args:\n step_name: The name of the step.\n\n Returns:\n `True` if the step is not loaded or already finished, `False` otherwise.\n \"\"\"\n with self._steps_load_status_lock:\n num_replicas = self._steps_load_status[step_name]\n\n # The step has finished (replicas = 0) or it has failed to load\n if num_replicas in 
[0, _STEP_LOAD_FAILED_CODE, _STEP_UNLOADED_CODE]:\n return True\n\n return False\n\n @property\n @abstractmethod\n def QueueClass(self) -> Callable:\n \"\"\"The class of the queue to use in the pipeline.\"\"\"\n pass\n\n def _create_step_input_queue(self, step_name: str) -> \"Queue[Any]\":\n \"\"\"Creates an input queue for a step.\n\n Args:\n step_name: The name of the step.\n\n Returns:\n The input queue created.\n \"\"\"\n input_queue = self.QueueClass()\n self.dag.set_step_attr(step_name, constants.INPUT_QUEUE_ATTR_NAME, input_queue)\n return input_queue\n\n @abstractmethod\n def _run_step(self, step: \"_Step\", input_queue: \"Queue[Any]\", replica: int) -> None:\n \"\"\"Runs the `Step` instance.\n\n Args:\n step: The `Step` instance to run.\n input_queue: The input queue where the step will receive the batches.\n replica: The replica ID assigned.\n \"\"\"\n pass\n\n def _run_steps(self, steps: Iterable[str]) -> None:\n \"\"\"Runs the `Step`s of the pipeline, creating first an input queue for each step\n that will be used to send the batches.\n\n Args:\n steps:\n \"\"\"\n for step_name in steps:\n step: \"Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n input_queue = self._steps_input_queues[step.name] # type: ignore\n\n # Set `pipeline` to `None` as in some Python environments the pipeline is not\n # picklable and it will raise an error when trying to send the step to the process.\n # `TypeError: cannot pickle 'code' object`\n step.pipeline = None\n\n if not step.is_normal and step.resources.replicas > 1: # type: ignore\n self._logger.warning(\n f\"Step '{step_name}' is a `GeneratorStep` or `GlobalStep` and has more\"\n \" than 1 replica. Only `Step` instances can have more than 1 replica.\"\n \" The number of replicas for the step will be set to 1.\"\n )\n\n step_num_replicas: int = step.resources.replicas if step.is_normal else 1 # type: ignore\n for replica in range(step_num_replicas):\n self._logger.debug(\n f\"Running 1 replica of step '{step.name}' with ID {replica}...\"\n )\n self._run_step(\n step=step.model_copy(deep=True),\n input_queue=input_queue,\n replica=replica,\n )\n\n def _add_batches_back_to_batch_manager(self) -> None:\n \"\"\"Add the `Batch`es that were sent to a `Step` back to the `_BatchManager`. This\n method should be used when the pipeline has been stopped prematurely.\"\"\"\n self._logger.debug(\n \"Adding batches from step input queues back to the batch manager...\"\n )\n for step_name in self.dag:\n node = self.dag.get_step(step_name)\n step: \"_Step\" = node[constants.STEP_ATTR_NAME]\n if step.is_generator:\n continue\n if input_queue := node.get(constants.INPUT_QUEUE_ATTR_NAME):\n while not input_queue.empty():\n batch = input_queue.get()\n if not isinstance(batch, _Batch):\n continue\n self._batch_manager.add_batch( # type: ignore\n to_step=step_name,\n batch=batch,\n prepend=True,\n )\n self._logger.debug(\n f\"Adding batch back to the batch manager: {batch}\"\n )\n if self._check_step_not_loaded_or_finished(step_name):\n # Notify the step to stop\n input_queue.put(None)\n self._logger.debug(\"Finished adding batches back to the batch manager.\")\n\n def _consume_output_queue(self) -> None:\n \"\"\"Consumes the `Batch`es from the output queue until it's empty. 
This method should\n be used when the pipeline has been stopped prematurely to consume and to not lose\n the `Batch`es that were processed by the leaf `Step`s before stopping the pipeline.\"\"\"\n while not self._output_queue.empty():\n batch = self._output_queue.get()\n if batch is None:\n continue\n self._process_batch(batch, send_last_batch_flag=False)\n self._handle_batch_on_stop(batch)\n\n def _manage_batch_flow(self, batch: \"_Batch\") -> None:\n \"\"\"Checks if the step that generated the batch has more data in its buffer to\n generate a new batch. If there's data, then a new batch is sent to the step. If\n the step has no data in its buffer, then the predecessors generator steps are\n requested to send a new batch.\n\n Args:\n batch: The batch that was processed.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n route_to, do_not_route_to, routed = self._get_successors(batch)\n\n self._register_batch(batch)\n\n # Keep track of the steps that the batch was routed to\n if routed:\n batch.batch_routed_to = route_to\n\n self._set_next_expected_seq_no(\n steps=do_not_route_to,\n from_step=batch.step_name,\n next_expected_seq_no=batch.seq_no + 1,\n )\n\n step = self._get_step_from_batch(batch)\n\n # Add the batch to the successors input buffers\n for successor in route_to:\n # Copy batch to avoid modifying the same reference in the batch manager\n batch_to_add = batch.copy() if len(route_to) > 1 else batch\n\n self._batch_manager.add_batch(successor, batch_to_add)\n\n # Check if the step is a generator and if there are successors that need data\n # from this step. This usually happens when the generator `batch_size` is smaller\n # than the `input_batch_size` of the successor steps.\n if (\n step.is_generator\n and step.name in self._batch_manager.step_empty_buffers(successor)\n ):\n last_batch_sent = self._batch_manager.get_last_batch_sent(step.name)\n self._send_batch_to_step(last_batch_sent.next_batch()) # type: ignore\n\n # If successor step has enough data in its buffer to create a new batch, then\n # send the batch to the step.\n while new_batch := self._batch_manager.get_batch(successor):\n self._send_batch_to_step(new_batch)\n\n if not step.is_generator:\n # Step (\"this\", the one from which the batch was received) has enough data on its\n # buffers to create a new batch\n while new_batch := self._batch_manager.get_batch(step.name): # type: ignore\n self._send_batch_to_step(new_batch)\n else:\n self._request_more_batches_if_needed(step)\n else:\n # Case in which the pipeline only contains a `GeneratorStep` so we constanly keep\n # requesting batch after batch as there is no downstream step to consume it\n if len(self.dag) == 1:\n self._request_batch_from_generator(step.name) # type: ignore\n\n self._cache()\n\n def _send_to_step(self, step_name: str, to_send: Any) -> None:\n \"\"\"Sends something to the input queue of a step.\n\n Args:\n step_name: The name of the step.\n to_send: The object to send.\n \"\"\"\n input_queue = self.dag.get_step(step_name)[constants.INPUT_QUEUE_ATTR_NAME]\n input_queue.put(to_send)\n\n def _send_batch_to_step(self, batch: \"_Batch\") -> None:\n \"\"\"Sends a batch to the input queue of a step, writing the data of the batch\n to the filesystem and setting `batch.data_path` with the path where the data\n was written (if requiered i.e. 
the step is a global step or `use_fs_to_pass_data`)\n\n This method should be extended by the specific pipeline implementation, adding\n the logic to send the batch to the step.\n\n Args:\n batch: The batch to send.\n \"\"\"\n self._logger.debug(\n f\"Setting batch {batch.seq_no} as last batch sent to '{batch.step_name}': {batch}\"\n )\n self._batch_manager.set_last_batch_sent(batch) # type: ignore\n\n step: \"_Step\" = self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n if not step.is_generator and (step.is_global or self._use_fs_to_pass_data):\n base_path = UPath(self._storage_base_path) / step.name # type: ignore\n self._logger.debug(\n f\"Writing {batch.seq_no} batch for '{batch.step_name}' step to filesystem: {base_path}\"\n )\n batch.write_batch_data_to_fs(self._fs, base_path) # type: ignore\n\n self._logger.debug(\n f\"Sending batch {batch.seq_no} to step '{batch.step_name}': {batch}\"\n )\n self._send_to_step(batch.step_name, batch)\n\n def _gather_requirements(self) -> List[str]:\n \"\"\"Extracts the requirements from the steps to be used in the pipeline.\n\n Returns:\n List of requirements gathered from the steps.\n \"\"\"\n steps_requirements = []\n for step in self.dag:\n step_req = self.dag.get_step(step)[constants.STEP_ATTR_NAME].requirements\n steps_requirements.extend(step_req)\n\n return steps_requirements\n\n def _register_batch(self, batch: \"_Batch\") -> None:\n \"\"\"Registers a batch in the batch manager.\n\n Args:\n batch: The batch to register.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n self._batch_manager.register_batch(\n batch, steps_data_path=self._cache_location[\"steps_data\"]\n ) # type: ignore\n self._logger.debug(\n f\"Batch {batch.seq_no} from step '{batch.step_name}' registered in batch\"\n \" manager\"\n )\n\n def _send_last_batch_flag_to_step(self, step_name: str) -> None:\n \"\"\"Sends the `LAST_BATCH_SENT_FLAG` to a step to stop processing batches.\n\n Args:\n step_name: The name of the step.\n \"\"\"\n self._logger.debug(\n f\"Sending `LAST_BATCH_SENT_FLAG` to '{step_name}' step to stop processing\"\n \" batches...\"\n )\n\n for _ in range(self.dag.get_step_replica_count(step_name)):\n self._send_to_step(step_name, constants.LAST_BATCH_SENT_FLAG)\n self._batch_manager.set_last_batch_flag_sent_to(step_name) # type: ignore\n\n def _request_initial_batches(self) -> None:\n \"\"\"Requests the initial batches to the generator steps.\"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n for step in self._batch_manager._steps.values():\n if not self._is_step_running(step.step_name):\n continue\n if batch := step.get_batch():\n self._logger.debug(\n f\"Sending initial batch to '{step.step_name}' step: {batch}\"\n )\n self._send_batch_to_step(batch)\n\n for step_name in self.dag.root_steps:\n if not self._is_step_running(step_name):\n continue\n seq_no = 0\n if last_batch := self._batch_manager.get_last_batch(step_name):\n seq_no = last_batch.seq_no + 1\n batch = _Batch(seq_no=seq_no, step_name=step_name, last_batch=self._dry_run)\n self._logger.debug(\n f\"Requesting initial batch to '{step_name}' generator step: {batch}\"\n )\n self._send_batch_to_step(batch)\n\n def _request_batch_from_generator(self, step_name: str) -> None:\n \"\"\"Request a new batch to a `GeneratorStep`.\n\n Args:\n step_name: the name of the `GeneratorStep` to which a batch has to be requested.\n \"\"\"\n # Get the last batch that the previous step sent to generate the next batch\n # (next `seq_no`).\n last_batch = 
self._batch_manager.get_last_batch_sent(step_name) # type: ignore\n if last_batch is None:\n return\n self._send_batch_to_step(last_batch.next_batch())\n\n def _request_more_batches_if_needed(self, step: \"Step\") -> None:\n \"\"\"Request more batches to the predecessors steps of `step` if needed.\n\n Args:\n step: The step of which it has to be checked if more batches are needed from\n its predecessors.\n \"\"\"\n empty_buffers = self._batch_manager.step_empty_buffers(step.name) # type: ignore\n for previous_step_name in empty_buffers:\n # Only more batches can be requested to the `GeneratorStep`s as they are the\n # only kind of steps that lazily generate batches.\n if previous_step_name not in self.dag.root_steps:\n continue\n\n self._request_batch_from_generator(previous_step_name)\n\n def _handle_batch_on_stop(self, batch: \"_Batch\") -> None:\n \"\"\"Handles a batch that was received from the output queue when the pipeline was\n stopped. It will add and register the batch in the batch manager.\n\n Args:\n batch: The batch to handle.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n self._batch_manager.register_batch(\n batch, steps_data_path=self._cache_location[\"steps_data\"]\n )\n step: \"Step\" = self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n for successor in self.dag.get_step_successors(step.name): # type: ignore\n self._batch_manager.add_batch(successor, batch)\n\n def _get_step_from_batch(self, batch: \"_Batch\") -> \"Step\":\n \"\"\"Gets the `Step` instance from a batch.\n\n Args:\n batch: The batch to get the step from.\n\n Returns:\n The `Step` instance.\n \"\"\"\n return self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n\n def _notify_steps_to_stop(self) -> None:\n \"\"\"Notifies the steps to stop their infinite running loop by sending `None` to\n their input queues.\"\"\"\n with self._steps_load_status_lock:\n for step_name, replicas in self._steps_load_status.items():\n if replicas > 0:\n for _ in range(replicas):\n self._send_to_step(step_name, None)\n\n def _get_successors(self, batch: \"_Batch\") -> Tuple[List[str], List[str], bool]:\n \"\"\"Gets the successors and the successors to which the batch has to be routed.\n\n Args:\n batch: The batch to which the successors will be determined.\n\n Returns:\n The successors to route the batch to and whether the batch was routed using\n a routing function.\n \"\"\"\n node = self.dag.get_step(batch.step_name)\n step: \"Step\" = node[constants.STEP_ATTR_NAME]\n successors = list(self.dag.get_step_successors(step.name)) # type: ignore\n route_to = successors\n\n # Check if the step has a routing function to send the batch to specific steps\n if routing_batch_function := node.get(\n constants.ROUTING_BATCH_FUNCTION_ATTR_NAME\n ):\n route_to = routing_batch_function(batch, successors)\n successors_str = \", \".join(f\"'{successor}'\" for successor in route_to)\n self._logger.info(\n f\"\ud83d\ude8f Using '{step.name}' routing function to send batch {batch.seq_no} to steps: {successors_str}\"\n )\n\n return route_to, list(set(successors) - set(route_to)), route_to != successors\n\n def _set_next_expected_seq_no(\n self, steps: List[str], from_step: str, next_expected_seq_no: int\n ) -> None:\n \"\"\"Sets the next expected sequence number of a `_Batch` received by `step` from\n `from_step`. 
This is necessary as some `Step`s might not receive all the batches\n comming from the previous steps because there is a routing batch function.\n\n Args:\n steps: list of steps to which the next expected sequence number of a `_Batch`\n from `from_step` has to be updated in the `_BatchManager`.\n from_step: the name of the step from which the next expected sequence number\n of a `_Batch` has to be updated in `steps`.\n next_expected_seq_no: the number of the next expected sequence number of a `Batch`\n from `from_step`.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n for step in steps:\n self._batch_manager.set_next_expected_seq_no(\n step_name=step,\n from_step=from_step,\n next_expected_seq_no=next_expected_seq_no,\n )\n\n @abstractmethod\n def _teardown(self) -> None:\n \"\"\"Clean/release/stop resources reserved to run the pipeline.\"\"\"\n pass\n\n @abstractmethod\n def _set_steps_not_loaded_exception(self) -> None:\n \"\"\"Used to raise `RuntimeError` when the load of the steps failed.\n\n Raises:\n RuntimeError: containing the information and why a step failed to be loaded.\n \"\"\"\n pass\n\n @abstractmethod\n def _stop(self) -> None:\n \"\"\"Stops the pipeline in a controlled way.\"\"\"\n pass\n\n def _stop_load_queue_loop(self) -> None:\n \"\"\"Stops the `_load_queue` loop sending a `None`.\"\"\"\n self._logger.debug(\"Sending `None` to the load queue to notify stop...\")\n self._load_queue.put(None)\n\n def _stop_output_queue_loop(self) -> None:\n \"\"\"Stops the `_output_queue` loop sending a `None`.\"\"\"\n self._logger.debug(\"Sending `None` to the output queue to notify stop...\")\n self._output_queue.put(None)\n\n def _handle_keyboard_interrupt(self) -> Any:\n \"\"\"Handles KeyboardInterrupt signal sent during the Pipeline.run method.\n\n It will try to call self._stop (if the pipeline didn't started yet, it won't\n have any effect), and if the pool is already started, will close it before exiting\n the program.\n\n Returns:\n The original `signal.SIGINT` handler.\n \"\"\"\n\n def signal_handler(signumber: int, frame: Any) -> None:\n self._stop()\n\n return signal.signal(signal.SIGINT, signal_handler)\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.signature","title":"signature: str property ","text":"Makes a signature (hash) of a pipeline, using the step ids and the adjacency between them. The main use is to find the pipeline in the cache folder. Returns: Type Description str Signature of the pipeline. "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.aggregated_steps_signature","title":"aggregated_steps_signature: str property ","text":"Creates an aggregated signature using Step s signature that will be used for the _BatchManager . Returns: Type Description str The aggregated signature. "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.QueueClass","title":"QueueClass: Callable abstractmethod property ","text":"The class of the queue to use in the pipeline. "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__init__","title":"__init__(name=None, description=None, cache_dir=None, enable_metadata=False, requirements=None) ","text":"Initialize the BasePipeline instance. Parameters: Name Type Description Default name Optional[str] The name of the pipeline. If not generated, a random one will be generated by default. None description Optional[str] A description of the pipeline. Defaults to None . 
None cache_dir Optional[Union[str, PathLike]] A directory where the pipeline will be cached. Defaults to None . None enable_metadata bool Whether to include the distilabel metadata column for the pipeline in the final Distiset . It contains metadata used by distilabel, for example the raw outputs of the LLM without processing would be here, inside raw_output_... field. Defaults to False . False requirements Optional[List[str]] List of requirements that must be installed to run the pipeline. Defaults to None , but can be helpful to inform in a pipeline to be shared that this requirements must be installed. None Source code in src/distilabel/pipeline/base.py def __init__(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n cache_dir: Optional[Union[str, \"PathLike\"]] = None,\n enable_metadata: bool = False,\n requirements: Optional[List[str]] = None,\n) -> None:\n \"\"\"Initialize the `BasePipeline` instance.\n\n Args:\n name: The name of the pipeline. If not generated, a random one will be generated by default.\n description: A description of the pipeline. Defaults to `None`.\n cache_dir: A directory where the pipeline will be cached. Defaults to `None`.\n enable_metadata: Whether to include the distilabel metadata column for the pipeline\n in the final `Distiset`. It contains metadata used by distilabel, for example\n the raw outputs of the `LLM` without processing would be here, inside `raw_output_...`\n field. Defaults to `False`.\n requirements: List of requirements that must be installed to run the pipeline.\n Defaults to `None`, but can be helpful to inform in a pipeline to be shared\n that this requirements must be installed.\n \"\"\"\n self.name = name or _PIPELINE_DEFAULT_NAME\n self.description = description\n self._enable_metadata = enable_metadata\n self.dag = DAG()\n\n if cache_dir:\n self._cache_dir = Path(cache_dir)\n elif env_cache_dir := envs.DISTILABEL_CACHE_DIR:\n self._cache_dir = Path(env_cache_dir)\n else:\n self._cache_dir = constants.PIPELINES_CACHE_DIR\n\n self._logger = logging.getLogger(\"distilabel.pipeline\")\n\n self._batch_manager: Optional[\"_BatchManager\"] = None\n self._write_buffer: Optional[\"_WriteBuffer\"] = None\n self._steps_input_queues: Dict[str, \"Queue\"] = {}\n\n self._steps_load_status: Dict[str, int] = {}\n self._steps_load_status_lock = threading.Lock()\n\n self._stop_called = False\n self._stop_called_lock = threading.Lock()\n self._stop_calls = 0\n\n self._recover_offline_batch_generate_for_step: Union[\n Tuple[str, List[List[Dict[str, Any]]]], None\n ] = None\n\n self._fs: Optional[fsspec.AbstractFileSystem] = None\n self._storage_base_path: Optional[str] = None\n self._use_fs_to_pass_data: bool = False\n self._dry_run = False\n\n self._current_stage = 0\n self._stages_last_batch: List[List[str]] = []\n self._load_groups = []\n\n self.requirements = requirements or []\n\n self._exception: Union[Exception, None] = None\n\n self._log_queue: Union[\"Queue[Any]\", None] = None\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__enter__","title":"__enter__() ","text":"Set the global pipeline instance when entering a pipeline context. 
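For reference, a minimal sketch of how the constructor arguments above and the context-manager behaviour fit together. The name, description, cache directory and extra requirement shown are illustrative assumptions, not values taken from this page:

```python
from distilabel.pipeline import Pipeline

# Entering the context registers this pipeline as the current global one, so any
# step instantiated inside the `with` block is attached to it automatically.
with Pipeline(
    name="my-pipeline",                  # illustrative name
    description="A synthetic data pipeline",
    cache_dir="./distilabel_cache",      # otherwise DISTILABEL_CACHE_DIR or the default cache dir is used
    enable_metadata=True,                # keep the distilabel metadata column in the final Distiset
    requirements=["datasets"],           # assumption: an extra requirement, purely for illustration
) as pipeline:
    ...  # define and connect steps here
```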
Source code in src/distilabel/pipeline/base.py def __enter__(self) -> Self:\n \"\"\"Set the global pipeline instance when entering a pipeline context.\"\"\"\n _GlobalPipelineManager.set_pipeline(self)\n return self\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__exit__","title":"__exit__(exc_type, exc_value, traceback) ","text":"Unset the global pipeline instance when exiting a pipeline context. Source code in src/distilabel/pipeline/base.py def __exit__(self, exc_type, exc_value, traceback) -> None:\n \"\"\"Unset the global pipeline instance when exiting a pipeline context.\"\"\"\n _GlobalPipelineManager.set_pipeline(None)\n self._set_pipeline_name()\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.run","title":"run(parameters=None, load_groups=None, use_cache=True, storage_parameters=None, use_fs_to_pass_data=False, dataset=None, dataset_batch_size=50, logging_handlers=None) ","text":"Run the pipeline. It will set the runtime parameters for the steps and validate the pipeline. This method should be extended by the specific pipeline implementation, adding the logic to run the pipeline. Parameters: Name Type Description Default parameters Optional[Dict[str, Dict[str, Any]]] A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None . None load_groups Optional[LoadGroups] A list containing lists of steps that have to be loaded together and in isolation with respect to the rest of the steps of the pipeline. This argument also allows passing the following modes: - \"sequential_step_execution\": each step will be executed in a stage i.e. the execution of the steps will be sequential.
Defaults to None . None use_cache bool Whether to use the cache from previous pipeline runs. Defaults to True . True storage_parameters Optional[Dict[str, Any]] A dictionary with the storage parameters (fsspec and path) that will be used to store the data of the _Batch es passed between the steps if use_fs_to_pass_data is True (for the batches received by a GlobalStep it will be always used). It must have at least the \"path\" key, and it can contain additional keys depending on the protocol. By default, it will use the local file system and a directory in the cache directory. Defaults to None . None use_fs_to_pass_data bool Whether to use the file system to pass the data of the _Batch es between the steps. Even if this parameter is False , the Batch es received by GlobalStep s will always use the file system to pass the data. Defaults to False . False dataset Optional[InputDataset] If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None . None dataset_batch_size int if dataset is given, this will be the size of the batches yield by the GeneratorStep created using the dataset . Defaults to 50 . 50 logging_handlers Optional[List[Handler]] A list of logging handlers that will be used to log the output of the pipeline. This argument can be useful so the logging messages can be extracted and used in a different context. Defaults to None . None Returns: Type Description Distiset The Distiset created by the pipeline. Source code in src/distilabel/pipeline/base.py def run(\n self,\n parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n load_groups: Optional[\"LoadGroups\"] = None,\n use_cache: bool = True,\n storage_parameters: Optional[Dict[str, Any]] = None,\n use_fs_to_pass_data: bool = False,\n dataset: Optional[\"InputDataset\"] = None,\n dataset_batch_size: int = 50,\n logging_handlers: Optional[List[logging.Handler]] = None,\n) -> \"Distiset\": # type: ignore\n \"\"\"Run the pipeline. It will set the runtime parameters for the steps and validate\n the pipeline.\n\n This method should be extended by the specific pipeline implementation,\n adding the logic to run the pipeline.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n load_groups: A list containing lists of steps that have to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n This argument also allows passing the following modes:\n\n - \"sequential_step_execution\": each step will be executed in a stage i.e.\n the execution of the steps will be sequential.\n\n Defaults to `None`.\n use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n `True`.\n storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n that will be used to store the data of the `_Batch`es passed between the\n steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n `GlobalStep` it will be always used). It must have at least the \"path\" key,\n and it can contain additional keys depending on the protocol. By default,\n it will use the local file system and a directory in the cache directory.\n Defaults to `None`.\n use_fs_to_pass_data: Whether to use the file system to pass the data of\n the `_Batch`es between the steps. 
Even if this parameter is `False`, the\n `Batch`es received by `GlobalStep`s will always use the file system to\n pass the data. Defaults to `False`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n dataset_batch_size: if `dataset` is given, this will be the size of the batches\n yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n logging_handlers: A list of logging handlers that will be used to log the\n output of the pipeline. This argument can be useful so the logging messages\n can be extracted and used in a different context. Defaults to `None`.\n\n Returns:\n The `Distiset` created by the pipeline.\n \"\"\"\n\n self._exception: Union[Exception, None] = None\n\n # Set the runtime parameters that will be used during the pipeline execution.\n # They are used to generate the signature of the pipeline that is used to hit the\n # cache when the pipeline is run, so it's important to do it first.\n self._set_runtime_parameters(parameters or {})\n\n self._refresh_pipeline_from_cache()\n\n if dataset is not None:\n self._add_dataset_generator_step(dataset, dataset_batch_size)\n\n setup_logging(\n log_queue=self._log_queue,\n filename=str(self._cache_location[\"log_file\"]),\n logging_handlers=logging_handlers,\n )\n\n # Set the name of the pipeline if it's the default one. This should be called\n # if the pipeline is defined within the context manager, and the run is called\n # outside of it. Is here in the following case:\n # with Pipeline() as pipeline:\n # pipeline.run()\n self._set_pipeline_name()\n\n # Validate the pipeline DAG to check that all the steps are chainable, there are\n # no missing runtime parameters, batch sizes are correct, load groups are valid,\n # etc.\n self._load_groups = self._built_load_groups(load_groups)\n self._validate()\n\n self._set_pipeline_artifacts_path_in_steps()\n\n # Set the initial load status for all the steps\n self._init_steps_load_status()\n\n # Load the stages status or initialize it\n self._load_stages_status(use_cache)\n\n # Load the `_BatchManager` from cache or create one from scratch\n self._load_batch_manager(use_cache)\n\n # Check pipeline requirements are installed\n self._check_requirements()\n\n # Setup the filesystem that will be used to pass the data of the `_Batch`es\n self._setup_fsspec(storage_parameters)\n self._use_fs_to_pass_data = use_fs_to_pass_data\n\n if self._dry_run:\n self._logger.info(\"\ud83c\udf35 Dry run mode\")\n\n # If the batch manager is not able to generate batches, that means that the loaded\n # `_BatchManager` from cache didn't have any remaining batches to process i.e.\n # the previous pipeline execution was completed successfully.\n if not self._batch_manager.can_generate(): # type: ignore\n self._logger.info(\n \"\ud83d\udcbe Loaded batch manager from cache doesn't contain any remaining data.\"\n \" Returning `Distiset` from cache data...\"\n )\n distiset = create_distiset(\n data_dir=self._cache_location[\"data\"],\n pipeline_path=self._cache_location[\"pipeline\"],\n log_filename_path=self._cache_location[\"log_file\"],\n enable_metadata=self._enable_metadata,\n dag=self.dag,\n )\n stop_logging()\n return distiset\n\n self._setup_write_buffer(use_cache)\n\n self._print_load_stages_info()\n 
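Putting the pieces above together, here is a minimal sketch of a full run. The step names, the use of `LoadDataFromDicts` and `KeepColumns`, and the toy data are illustrative assumptions rather than part of this API reference:

```python
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns, LoadDataFromDicts

with Pipeline(name="toy-pipeline") as pipeline:
    load_data = LoadDataFromDicts(
        name="load_data",
        data=[{"instruction": "Tell me a joke."}, {"instruction": "Write a haiku."}],
    )
    keep = KeepColumns(name="keep_columns", columns=["instruction"])
    load_data >> keep

if __name__ == "__main__":  # the local pipeline uses multiprocessing, so guard the entry point
    distiset = pipeline.run(
        parameters={"load_data": {"batch_size": 2}},  # runtime parameters keyed by step name
        use_cache=False,
    )
    print(distiset)
```

For a quick smoke test of the same DAG with a single batch and the cache disabled, see the `dry_run` method documented in the next section.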
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.dry_run","title":"dry_run(parameters=None, batch_size=1, dataset=None) ","text":"Do a dry run to test the pipeline runs as expected. Running a Pipeline in dry run mode will set all the batch_size of generator steps to the specified batch_size , and run just with a single batch, effectively running the whole pipeline with a single example. The cache will be set to False . Parameters: Name Type Description Default parameters Optional[Dict[str, Dict[str, Any]]] A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None . None batch_size int The batch size of the unique batch generated by the generators steps of the pipeline. Defaults to 1 . 1 dataset Optional[InputDataset] If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None . None Returns: Type Description Distiset Will return the Distiset as the main run method would do. Source code in src/distilabel/pipeline/base.py def dry_run(\n self,\n parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n batch_size: int = 1,\n dataset: Optional[\"InputDataset\"] = None,\n) -> \"Distiset\":\n \"\"\"Do a dry run to test the pipeline runs as expected.\n\n Running a `Pipeline` in dry run mode will set all the `batch_size` of generator steps\n to the specified `batch_size`, and run just with a single batch, effectively\n running the whole pipeline with a single example. The cache will be set to `False`.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n batch_size: The batch size of the unique batch generated by the generators\n steps of the pipeline. Defaults to `1`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n\n Returns:\n Will return the `Distiset` as the main run method would do.\n \"\"\"\n self._dry_run = True\n\n for step_name in self.dag:\n step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n\n if step.is_generator:\n if not parameters:\n parameters = {}\n parameters[step_name] = {\"batch_size\": batch_size}\n\n distiset = self.run(parameters=parameters, use_cache=False, dataset=dataset)\n\n self._dry_run = False\n return distiset\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.get_load_stages","title":"get_load_stages(load_groups=None) ","text":"Convenient method to get the load stages of a pipeline. Parameters: Name Type Description Default load_groups Optional[LoadGroups] A list containing list of steps that has to be loaded together and in isolation with respect to the rest of the steps of the pipeline. Defaults to None . None Returns: Type Description LoadStages A tuple with the first element containing asorted list by stage containing LoadStages lists with the names of the steps of the stage, and the second element a list LoadStages sorted by stage containing lists with the names of the last steps of the stage. 
Source code in src/distilabel/pipeline/base.py def get_load_stages(self, load_groups: Optional[\"LoadGroups\"] = None) -> LoadStages:\n \"\"\"Convenient method to get the load stages of a pipeline.\n\n Args:\n load_groups: A list containing list of steps that has to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n Defaults to `None`.\n\n Returns:\n A tuple with the first element containing asorted list by stage containing\n lists with the names of the steps of the stage, and the second element a list\n sorted by stage containing lists with the names of the last steps of the stage.\n \"\"\"\n load_groups = self._built_load_groups(load_groups)\n return self.dag.get_steps_load_stages(load_groups)\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.get_runtime_parameters_info","title":"get_runtime_parameters_info() ","text":"Get the runtime parameters for the steps in the pipeline. Returns: Type Description PipelineRuntimeParametersInfo A dictionary with the step name as the key and a list of dictionaries with PipelineRuntimeParametersInfo the parameter name and the parameter info as the value. Source code in src/distilabel/pipeline/base.py def get_runtime_parameters_info(self) -> \"PipelineRuntimeParametersInfo\":\n \"\"\"Get the runtime parameters for the steps in the pipeline.\n\n Returns:\n A dictionary with the step name as the key and a list of dictionaries with\n the parameter name and the parameter info as the value.\n \"\"\"\n runtime_parameters = {}\n for step_name in self.dag:\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n runtime_parameters[step_name] = step.get_runtime_parameters_info()\n return runtime_parameters\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.draw","title":"draw(path='pipeline.png', top_to_bottom=False, show_edge_labels=True) ","text":"Draws the pipeline. Parameters: Name Type Description Default path Optional[Union[str, Path]] The path to save the image to. 'pipeline.png' top_to_bottom bool Whether to draw the DAG top to bottom. Defaults to False . False show_edge_labels bool Whether to show the edge labels. Defaults to True . True Returns: Type Description str The path to the saved image. Source code in src/distilabel/pipeline/base.py def draw(\n self,\n path: Optional[Union[str, Path]] = \"pipeline.png\",\n top_to_bottom: bool = False,\n show_edge_labels: bool = True,\n) -> str:\n \"\"\"\n Draws the pipeline.\n\n Parameters:\n path: The path to save the image to.\n top_to_bottom: Whether to draw the DAG top to bottom. Defaults to `False`.\n show_edge_labels: Whether to show the edge labels. Defaults to `True`.\n\n Returns:\n The path to the saved image.\n \"\"\"\n png = self.dag.draw(\n top_to_bottom=top_to_bottom, show_edge_labels=show_edge_labels\n )\n with open(path, \"wb\") as f:\n f.write(png)\n return path\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__repr__","title":"__repr__() ","text":"If running in a Jupyter notebook, display an image representing this Pipeline . 
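A short usage sketch for the `draw` method documented above (the output path and orientation are arbitrary choices, not defaults mandated by the API):

```python
# Render the pipeline DAG to an image file and report where it was written.
image_path = pipeline.draw(path="pipeline.png", top_to_bottom=True, show_edge_labels=True)
print(f"Pipeline diagram saved to {image_path}")
```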
Source code in src/distilabel/pipeline/base.py def __repr__(self) -> str:\n \"\"\"\n If running in a Jupyter notebook, display an image representing this `Pipeline`.\n \"\"\"\n if in_notebook():\n try:\n from IPython.display import Image, display\n\n image_data = self.dag.draw()\n\n display(Image(image_data))\n except Exception:\n pass\n return super().__repr__()\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.from_dict","title":"from_dict(data) classmethod ","text":"Create a Pipeline from a dict containing the serialized data. Note It's intended for internal use. Parameters: Name Type Description Default data Dict[str, Any] Dictionary containing the serialized data from a Pipeline. required Returns: Name Type Description BasePipeline Self Pipeline recreated from the dictionary info. Source code in src/distilabel/pipeline/base.py @classmethod\ndef from_dict(cls, data: Dict[str, Any]) -> Self:\n \"\"\"Create a Pipeline from a dict containing the serialized data.\n\n Note:\n It's intended for internal use.\n\n Args:\n data (Dict[str, Any]): Dictionary containing the serialized data from a Pipeline.\n\n Returns:\n BasePipeline: Pipeline recreated from the dictionary info.\n \"\"\"\n name = data[\"pipeline\"][\"name\"]\n description = data[\"pipeline\"].get(\"description\")\n requirements = data.get(\"requirements\", [])\n with cls(name=name, description=description, requirements=requirements) as pipe:\n pipe.dag = DAG.from_dict(data[\"pipeline\"])\n return pipe\n "},{"location":"api/pipeline/#distilabel.pipeline.local","title":"local ","text":""},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline","title":"Pipeline ","text":" Bases: BasePipeline Local pipeline implementation using multiprocessing . Source code in src/distilabel/pipeline/local.py class Pipeline(BasePipeline):\n \"\"\"Local pipeline implementation using `multiprocessing`.\"\"\"\n\n def ray(\n self,\n ray_head_node_url: Optional[str] = None,\n ray_init_kwargs: Optional[Dict[str, Any]] = None,\n ) -> RayPipeline:\n \"\"\"Creates a `RayPipeline` using the init parameters of this pipeline. This is a\n convenient method that can be used to \"transform\" one common `Pipeline` to a `RayPipeline`\n and it's mainly used by the CLI.\n\n Args:\n ray_head_node_url: The URL that can be used to connect to the head node of\n the Ray cluster. Normally, you won't want to use this argument as the\n recommended way to submit a job to a Ray cluster is using the [Ray Jobs\n CLI](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html#ray-jobs-overview).\n Defaults to `None`.\n ray_init_kwargs: kwargs that will be passed to the `ray.init` method. 
Defaults\n to `None`.\n\n Returns:\n A `RayPipeline` instance.\n \"\"\"\n pipeline = RayPipeline(\n name=self.name,\n description=self.description,\n cache_dir=self._cache_dir,\n enable_metadata=self._enable_metadata,\n requirements=self.requirements,\n ray_head_node_url=ray_head_node_url,\n ray_init_kwargs=ray_init_kwargs,\n )\n pipeline.dag = self.dag\n return pipeline\n\n def run(\n self,\n parameters: Optional[Dict[Any, Dict[str, Any]]] = None,\n load_groups: Optional[\"LoadGroups\"] = None,\n use_cache: bool = True,\n storage_parameters: Optional[Dict[str, Any]] = None,\n use_fs_to_pass_data: bool = False,\n dataset: Optional[\"InputDataset\"] = None,\n dataset_batch_size: int = 50,\n logging_handlers: Optional[List[\"logging.Handler\"]] = None,\n ) -> \"Distiset\":\n \"\"\"Runs the pipeline.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n load_groups: A list containing lists of steps that have to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n This argument also allows passing the following modes:\n\n - \"sequential_step_execution\": each step will be executed in a stage i.e.\n the execution of the steps will be sequential.\n\n Defaults to `None`.\n use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n `True`.\n storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n that will be used to store the data of the `_Batch`es passed between the\n steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n `GlobalStep` it will be always used). It must have at least the \"path\" key,\n and it can contain additional keys depending on the protocol. By default,\n it will use the local file system and a directory in the cache directory.\n Defaults to `None`.\n use_fs_to_pass_data: Whether to use the file system to pass the data of\n the `_Batch`es between the steps. Even if this parameter is `False`, the\n `Batch`es received by `GlobalStep`s will always use the file system to\n pass the data. Defaults to `False`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n dataset_batch_size: if `dataset` is given, this will be the size of the batches\n yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n logging_handlers: A list of logging handlers that will be used to log the\n output of the pipeline. This argument can be useful so the logging messages\n can be extracted and used in a different context. Defaults to `None`.\n\n Returns:\n The `Distiset` created by the pipeline.\n\n Raises:\n RuntimeError: If the pipeline fails to load all the steps.\n \"\"\"\n if script_executed_in_ray_cluster():\n print(\"Script running in Ray cluster... 
Using `RayPipeline`...\")\n return self.ray().run(\n parameters=parameters,\n use_cache=use_cache,\n storage_parameters=storage_parameters,\n use_fs_to_pass_data=use_fs_to_pass_data,\n dataset=dataset,\n dataset_batch_size=dataset_batch_size,\n )\n\n self._log_queue = cast(\"Queue[Any]\", mp.Queue())\n\n if distiset := super().run(\n parameters=parameters,\n load_groups=load_groups,\n use_cache=use_cache,\n storage_parameters=storage_parameters,\n use_fs_to_pass_data=use_fs_to_pass_data,\n dataset=dataset,\n dataset_batch_size=dataset_batch_size,\n logging_handlers=logging_handlers,\n ):\n return distiset\n\n num_processes = self.dag.get_total_replica_count()\n with (\n mp.Manager() as manager,\n _NoDaemonPool(\n num_processes,\n initializer=_init_worker,\n initargs=(\n self._log_queue,\n self.name,\n self.signature,\n ),\n ) as pool,\n ):\n self._manager = manager\n self._pool = pool\n self._output_queue = self.QueueClass()\n self._load_queue = self.QueueClass()\n self._handle_keyboard_interrupt()\n\n # Run the loop for receiving the load status of each step\n self._load_steps_thread = self._run_load_queue_loop_in_thread()\n\n # Start a loop to receive the output batches from the steps\n self._output_queue_thread = self._run_output_queue_loop_in_thread()\n self._output_queue_thread.join()\n\n self._teardown()\n\n if self._exception:\n raise self._exception\n\n distiset = create_distiset(\n self._cache_location[\"data\"],\n pipeline_path=self._cache_location[\"pipeline\"],\n log_filename_path=self._cache_location[\"log_file\"],\n enable_metadata=self._enable_metadata,\n dag=self.dag,\n )\n\n stop_logging()\n\n return distiset\n\n @property\n def QueueClass(self) -> Callable:\n \"\"\"The callable used to create the input and output queues.\n\n Returns:\n The callable to create a `Queue`.\n \"\"\"\n assert self._manager, \"Manager is not initialized\"\n return self._manager.Queue\n\n def _run_step(self, step: \"_Step\", input_queue: \"Queue[Any]\", replica: int) -> None:\n \"\"\"Runs the `Step` wrapped in a `_ProcessWrapper` in a separate process of the\n `Pool`.\n\n Args:\n step: The step to run.\n input_queue: The input queue to send the data to the step.\n replica: The replica ID assigned.\n \"\"\"\n assert self._pool, \"Pool is not initialized\"\n\n step_wrapper = _StepWrapper(\n step=step, # type: ignore\n replica=replica,\n input_queue=input_queue,\n output_queue=self._output_queue,\n load_queue=self._load_queue,\n dry_run=self._dry_run,\n ray_pipeline=False,\n )\n\n self._pool.apply_async(step_wrapper.run, error_callback=self._error_callback)\n\n def _error_callback(self, e: BaseException) -> None:\n \"\"\"Error callback that will be called when an error occurs in a `Step` process.\n\n Args:\n e: The exception raised by the process.\n \"\"\"\n global _SUBPROCESS_EXCEPTION\n\n # First we check that the exception is a `_StepWrapperException`, otherwise, we\n # print it out and stop the pipeline, since some errors may be unhandled\n if not isinstance(e, _StepWrapperException):\n self._logger.error(f\"\u274c Failed with an unhandled exception: {e}\")\n self._stop()\n return\n\n if e.is_load_error:\n self._logger.error(f\"\u274c Failed to load step '{e.step.name}': {e.message}\")\n _SUBPROCESS_EXCEPTION = e.subprocess_exception\n _SUBPROCESS_EXCEPTION.__traceback__ = tblib.Traceback.from_string( # type: ignore\n e.formatted_traceback\n ).as_traceback()\n return\n\n # If the step is global, is not in the last trophic level and has no successors,\n # then we can ignore the error and continue 
executing the pipeline\n step_name: str = e.step.name # type: ignore\n if (\n e.step.is_global\n and not self.dag.step_in_last_trophic_level(step_name)\n and list(self.dag.get_step_successors(step_name)) == []\n ):\n self._logger.error(\n f\"\u270b An error occurred when running global step '{step_name}' with no\"\n \" successors and not in the last trophic level. Pipeline execution can\"\n f\" continue. Error will be ignored.\"\n )\n self._logger.error(f\"Subprocess traceback:\\n\\n{e.formatted_traceback}\")\n return\n\n # Handle tasks using an `LLM` using offline batch generation\n if isinstance(\n e.subprocess_exception, DistilabelOfflineBatchGenerationNotFinishedException\n ):\n self._logger.info(\n f\"\u23f9\ufe0f '{e.step.name}' task stopped pipeline execution: LLM offline batch\"\n \" generation in progress. Rerun pipeline with cache to check results and\"\n \" continue execution.\"\n )\n self._set_step_for_recovering_offline_batch_generation(e.step, e.data) # type: ignore\n with self._stop_called_lock:\n if not self._stop_called:\n self._stop(acquire_lock=False)\n return\n\n # Global step with successors failed\n self._logger.error(f\"An error occurred in global step '{step_name}'\")\n self._logger.error(f\"Subprocess traceback:\\n\\n{e.formatted_traceback}\")\n\n self._stop()\n\n def _teardown(self) -> None:\n \"\"\"Clean/release/stop resources reserved to run the pipeline.\"\"\"\n if self._write_buffer:\n self._write_buffer.close()\n\n if self._batch_manager:\n self._batch_manager = None\n\n self._stop_load_queue_loop()\n self._load_steps_thread.join()\n\n if self._pool:\n self._pool.terminate()\n self._pool.join()\n\n if self._manager:\n self._manager.shutdown()\n self._manager.join()\n\n def _set_steps_not_loaded_exception(self) -> None:\n \"\"\"Raises a `RuntimeError` notifying that the steps load has failed.\n\n Raises:\n RuntimeError: containing the information and why a step failed to be loaded.\n \"\"\"\n self._exception = RuntimeError(\n \"Failed to load all the steps. Could not run pipeline.\"\n )\n self._exception.__cause__ = _SUBPROCESS_EXCEPTION\n\n def _stop(self, acquire_lock: bool = True) -> None:\n \"\"\"Stops the pipeline execution. It will first send `None` to the input queues\n of all the steps and then wait until the output queue is empty i.e. all the steps\n finished processing the batches that were sent before the stop flag. Then it will\n send `None` to the output queue to notify the pipeline to stop.\n\n Args:\n acquire_lock: Whether to acquire the lock to access the `_stop_called` attribute.\n \"\"\"\n\n if acquire_lock:\n self._stop_called_lock.acquire()\n\n if self._stop_called:\n self._stop_calls += 1\n if self._stop_calls == 1:\n self._logger.warning(\"\ud83d\uded1 Press again to force the pipeline to stop.\")\n elif self._stop_calls > 1:\n self._logger.warning(\"\ud83d\uded1 Forcing pipeline interruption.\")\n\n if self._pool:\n self._pool.terminate()\n self._pool.join()\n self._pool = None\n\n if self._manager:\n self._manager.shutdown()\n self._manager.join()\n self._manager = None\n\n stop_logging()\n\n sys.exit(1)\n\n return\n self._stop_called = True\n\n if acquire_lock:\n self._stop_called_lock.release()\n\n self._logger.debug(\n f\"Steps loaded before calling `stop`: {self._steps_load_status}\"\n )\n self._logger.info(\n \"\ud83d\uded1 Stopping pipeline. 
Waiting for steps to finish processing batches...\"\n )\n\n self._stop_output_queue_loop()\n "},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.QueueClass","title":"QueueClass: Callable property ","text":"The callable used to create the input and output queues. Returns: Type Description Callable The callable to create a Queue . "},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.ray","title":"ray(ray_head_node_url=None, ray_init_kwargs=None) ","text":"Creates a RayPipeline using the init parameters of this pipeline. This is a convenient method that can be used to \"transform\" one common Pipeline to a RayPipeline and it's mainly used by the CLI. Parameters: Name Type Description Default ray_head_node_url Optional[str] The URL that can be used to connect to the head node of the Ray cluster. Normally, you won't want to use this argument as the recommended way to submit a job to a Ray cluster is using the Ray Jobs CLI. Defaults to None . None ray_init_kwargs Optional[Dict[str, Any]] kwargs that will be passed to the ray.init method. Defaults to None . None Returns: Type Description RayPipeline A RayPipeline instance. Source code in src/distilabel/pipeline/local.py def ray(\n self,\n ray_head_node_url: Optional[str] = None,\n ray_init_kwargs: Optional[Dict[str, Any]] = None,\n) -> RayPipeline:\n \"\"\"Creates a `RayPipeline` using the init parameters of this pipeline. This is a\n convenient method that can be used to \"transform\" one common `Pipeline` to a `RayPipeline`\n and it's mainly used by the CLI.\n\n Args:\n ray_head_node_url: The URL that can be used to connect to the head node of\n the Ray cluster. Normally, you won't want to use this argument as the\n recommended way to submit a job to a Ray cluster is using the [Ray Jobs\n CLI](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html#ray-jobs-overview).\n Defaults to `None`.\n ray_init_kwargs: kwargs that will be passed to the `ray.init` method. Defaults\n to `None`.\n\n Returns:\n A `RayPipeline` instance.\n \"\"\"\n pipeline = RayPipeline(\n name=self.name,\n description=self.description,\n cache_dir=self._cache_dir,\n enable_metadata=self._enable_metadata,\n requirements=self.requirements,\n ray_head_node_url=ray_head_node_url,\n ray_init_kwargs=ray_init_kwargs,\n )\n pipeline.dag = self.dag\n return pipeline\n "},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.run","title":"run(parameters=None, load_groups=None, use_cache=True, storage_parameters=None, use_fs_to_pass_data=False, dataset=None, dataset_batch_size=50, logging_handlers=None) ","text":"Runs the pipeline. Parameters: Name Type Description Default parameters Optional[Dict[Any, Dict[str, Any]]] A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None . None load_groups Optional[LoadGroups] A list containing lists of steps that have to be loaded together and in isolation with respect to the rest of the steps of the pipeline. This argument also allows passing the following modes: - \"sequential_step_execution\": each step will be executed in a stage i.e. the execution of the steps will be sequential.
Defaults to None . None use_cache bool Whether to use the cache from previous pipeline runs. Defaults to True . True storage_parameters Optional[Dict[str, Any]] A dictionary with the storage parameters (fsspec and path) that will be used to store the data of the _Batch es passed between the steps if use_fs_to_pass_data is True (for the batches received by a GlobalStep it will be always used). It must have at least the \"path\" key, and it can contain additional keys depending on the protocol. By default, it will use the local file system and a directory in the cache directory. Defaults to None . None use_fs_to_pass_data bool Whether to use the file system to pass the data of the _Batch es between the steps. Even if this parameter is False , the Batch es received by GlobalStep s will always use the file system to pass the data. Defaults to False . False dataset Optional[InputDataset] If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None . None dataset_batch_size int if dataset is given, this will be the size of the batches yield by the GeneratorStep created using the dataset . Defaults to 50 . 50 logging_handlers Optional[List[Handler]] A list of logging handlers that will be used to log the output of the pipeline. This argument can be useful so the logging messages can be extracted and used in a different context. Defaults to None . None Returns: Type Description Distiset The Distiset created by the pipeline. Raises: Type Description RuntimeError If the pipeline fails to load all the steps. Source code in src/distilabel/pipeline/local.py def run(\n self,\n parameters: Optional[Dict[Any, Dict[str, Any]]] = None,\n load_groups: Optional[\"LoadGroups\"] = None,\n use_cache: bool = True,\n storage_parameters: Optional[Dict[str, Any]] = None,\n use_fs_to_pass_data: bool = False,\n dataset: Optional[\"InputDataset\"] = None,\n dataset_batch_size: int = 50,\n logging_handlers: Optional[List[\"logging.Handler\"]] = None,\n) -> \"Distiset\":\n \"\"\"Runs the pipeline.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n load_groups: A list containing lists of steps that have to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n This argument also allows passing the following modes:\n\n - \"sequential_step_execution\": each step will be executed in a stage i.e.\n the execution of the steps will be sequential.\n\n Defaults to `None`.\n use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n `True`.\n storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n that will be used to store the data of the `_Batch`es passed between the\n steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n `GlobalStep` it will be always used). It must have at least the \"path\" key,\n and it can contain additional keys depending on the protocol. By default,\n it will use the local file system and a directory in the cache directory.\n Defaults to `None`.\n use_fs_to_pass_data: Whether to use the file system to pass the data of\n the `_Batch`es between the steps. Even if this parameter is `False`, the\n `Batch`es received by `GlobalStep`s will always use the file system to\n pass the data. 
Defaults to `False`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n dataset_batch_size: if `dataset` is given, this will be the size of the batches\n yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n logging_handlers: A list of logging handlers that will be used to log the\n output of the pipeline. This argument can be useful so the logging messages\n can be extracted and used in a different context. Defaults to `None`.\n\n Returns:\n The `Distiset` created by the pipeline.\n\n Raises:\n RuntimeError: If the pipeline fails to load all the steps.\n \"\"\"\n if script_executed_in_ray_cluster():\n print(\"Script running in Ray cluster... Using `RayPipeline`...\")\n return self.ray().run(\n parameters=parameters,\n use_cache=use_cache,\n storage_parameters=storage_parameters,\n use_fs_to_pass_data=use_fs_to_pass_data,\n dataset=dataset,\n dataset_batch_size=dataset_batch_size,\n )\n\n self._log_queue = cast(\"Queue[Any]\", mp.Queue())\n\n if distiset := super().run(\n parameters=parameters,\n load_groups=load_groups,\n use_cache=use_cache,\n storage_parameters=storage_parameters,\n use_fs_to_pass_data=use_fs_to_pass_data,\n dataset=dataset,\n dataset_batch_size=dataset_batch_size,\n logging_handlers=logging_handlers,\n ):\n return distiset\n\n num_processes = self.dag.get_total_replica_count()\n with (\n mp.Manager() as manager,\n _NoDaemonPool(\n num_processes,\n initializer=_init_worker,\n initargs=(\n self._log_queue,\n self.name,\n self.signature,\n ),\n ) as pool,\n ):\n self._manager = manager\n self._pool = pool\n self._output_queue = self.QueueClass()\n self._load_queue = self.QueueClass()\n self._handle_keyboard_interrupt()\n\n # Run the loop for receiving the load status of each step\n self._load_steps_thread = self._run_load_queue_loop_in_thread()\n\n # Start a loop to receive the output batches from the steps\n self._output_queue_thread = self._run_output_queue_loop_in_thread()\n self._output_queue_thread.join()\n\n self._teardown()\n\n if self._exception:\n raise self._exception\n\n distiset = create_distiset(\n self._cache_location[\"data\"],\n pipeline_path=self._cache_location[\"pipeline\"],\n log_filename_path=self._cache_location[\"log_file\"],\n enable_metadata=self._enable_metadata,\n dag=self.dag,\n )\n\n stop_logging()\n\n return distiset\n "},{"location":"api/pipeline/routing_batch_function/","title":"Routing batch function","text":""},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function","title":"routing_batch_function ","text":""},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunc","title":"RoutingBatchFunc = Callable[[List[str]], List[str]] module-attribute ","text":"Type alias for a routing batch function. It takes a list of all the downstream steps and returns a list with the names of the steps that should receive the batch. "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction","title":"RoutingBatchFunction ","text":" Bases: BaseModel , _Serializable A thin wrapper around a routing batch function that can be used to route batches from one upstream step to specific downstream steps. 
Attributes: Name Type Description routing_function RoutingBatchFunc The routing function that takes a list of all the downstream steps and returns a list with the names of the steps that should receive the batch. _step Union[_Step, None] The upstream step that is connected to the routing batch function. _routed_batch_registry Dict[str, Dict[int, List[str]]] A dictionary that keeps track of the batches that have been routed to specific downstream steps. Source code in src/distilabel/pipeline/routing_batch_function.py class RoutingBatchFunction(BaseModel, _Serializable):\n \"\"\"A thin wrapper around a routing batch function that can be used to route batches\n from one upstream step to specific downstream steps.\n\n Attributes:\n routing_function: The routing function that takes a list of all the downstream steps\n and returns a list with the names of the steps that should receive the batch.\n _step: The upstream step that is connected to the routing batch function.\n _routed_batch_registry: A dictionary that keeps track of the batches that have been\n routed to specific downstream steps.\n \"\"\"\n\n routing_function: RoutingBatchFunc\n description: Optional[str] = None\n\n _step: Union[\"_Step\", None] = PrivateAttr(default=None)\n _routed_batch_registry: Dict[str, Dict[int, List[str]]] = PrivateAttr(\n default_factory=dict\n )\n _factory_function_module: Union[str, None] = PrivateAttr(default=None)\n _factory_function_name: Union[str, None] = PrivateAttr(default=None)\n _factory_function_kwargs: Union[Dict[str, Any], None] = PrivateAttr(default=None)\n\n def route_batch(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n should be routed.\n\n Args:\n batch: The batch that should be routed.\n steps: A list of all the downstream steps that can receive the batch.\n\n Returns:\n A list with the names of the steps that should receive the batch.\n \"\"\"\n routed_steps = self.routing_function(steps)\n self._register_routed_batch(batch, routed_steps)\n return routed_steps\n\n def set_factory_function(\n self,\n factory_function_module: str,\n factory_function_name: str,\n factory_function_kwargs: Dict[str, Any],\n ) -> None:\n \"\"\"Sets the factory function that was used to create the `routing_batch_function`.\n\n Args:\n factory_function_module: The module name where the factory function is defined.\n factory_function_name: The name of the factory function that was used to create\n the `routing_batch_function`.\n factory_function_kwargs: The keyword arguments that were used when calling the\n factory function.\n \"\"\"\n self._factory_function_module = factory_function_module\n self._factory_function_name = factory_function_name\n self._factory_function_kwargs = factory_function_kwargs\n\n def __call__(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n should be routed.\n\n Args:\n batch: The batch that should be routed.\n steps: A list of all the downstream steps that can receive the batch.\n\n Returns:\n A list with the names of the steps that should receive the batch.\n \"\"\"\n return self.route_batch(batch, steps)\n\n def _register_routed_batch(self, batch: \"_Batch\", routed_steps: List[str]) -> None:\n \"\"\"Registers a batch that has been routed to specific downstream steps.\n\n Args:\n batch: The batch that has been routed.\n routed_steps: The list of downstream steps that have been selected to 
receive\n the batch.\n \"\"\"\n upstream_step = batch.step_name\n batch_seq_no = batch.seq_no\n self._routed_batch_registry.setdefault(upstream_step, {}).setdefault(\n batch_seq_no, routed_steps\n )\n\n def __rshift__(\n self, other: List[\"DownstreamConnectableSteps\"]\n ) -> List[\"DownstreamConnectableSteps\"]:\n \"\"\"Connects a list of dowstream steps to the upstream step of the routing batch\n function.\n\n Args:\n other: A list of downstream steps that should be connected to the upstream step\n of the routing batch function.\n\n Returns:\n The list of downstream steps that have been connected to the upstream step of the\n routing batch function.\n \"\"\"\n if not isinstance(other, list):\n raise DistilabelUserError(\n f\"Can only set a `routing_batch_function` for a list of steps. Got: {other}.\"\n \" Please, review the right-hand side of the `routing_batch_function >> other`\"\n \" expression. It should be\"\n \" `upstream_step >> routing_batch_function >> [downstream_step_1, dowstream_step_2, ...]`.\",\n page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n )\n\n if not self._step:\n raise DistilabelUserError(\n \"Routing batch function doesn't have an upstream step. Cannot connect downstream\"\n \" steps before connecting the upstream step. Connect this routing batch\"\n \" function to an upstream step using the `>>` operator. For example:\"\n \" `upstream_step >> routing_batch_function >> [downstream_step_1, downstream_step_2, ...]`.\",\n page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n )\n\n for step in other:\n self._step.connect(step)\n return other\n\n def dump(self, **kwargs: Any) -> Dict[str, Any]:\n \"\"\"Dumps the routing batch function to a dictionary, and the information of the\n factory function used to create this routing batch function.\n\n Args:\n **kwargs: Additional keyword arguments that should be included in the dump.\n\n Returns:\n A dictionary with the routing batch function information and the factory function\n information.\n \"\"\"\n dump_info: Dict[str, Any] = {\"step\": self._step.name} # type: ignore\n\n if self.description:\n dump_info[\"description\"] = self.description\n\n if type_info := self._get_type_info():\n dump_info[TYPE_INFO_KEY] = type_info\n\n return dump_info\n\n def _get_type_info(self) -> Dict[str, Any]:\n \"\"\"Returns the information of the factory function used to create the routing batch\n function.\n\n Returns:\n A dictionary with the factory function information.\n \"\"\"\n\n type_info = {}\n\n if self._factory_function_module:\n type_info[\"module\"] = self._factory_function_module\n\n if self._factory_function_name:\n type_info[\"name\"] = self._factory_function_name\n\n if self._factory_function_kwargs:\n type_info[\"kwargs\"] = self._factory_function_kwargs\n\n return type_info\n\n @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> Self:\n \"\"\"Loads a routing batch function from a dictionary. 
It must contain the information\n of the factory function used to create the routing batch function.\n\n Args:\n data: A dictionary with the routing batch function information and the factory\n function information.\n \"\"\"\n type_info = data.get(TYPE_INFO_KEY)\n if not type_info:\n step = data.get(\"step\")\n raise ValueError(\n f\"The routing batch function for step '{step}' was created without a factory\"\n \" function, and it cannot be reconstructed.\"\n )\n\n module = type_info.get(\"module\")\n name = type_info.get(\"name\")\n kwargs = type_info.get(\"kwargs\")\n\n if not module or not name or not kwargs:\n raise ValueError(\n \"The routing batch function was created with a factory function, but the\"\n \" information is incomplete. Cannot reconstruct the routing batch function.\"\n )\n\n routing_batch_function = _get_module_attr(module=module, name=name)(**kwargs)\n routing_batch_function.description = data.get(\"description\")\n routing_batch_function.set_factory_function(\n factory_function_module=module,\n factory_function_name=name,\n factory_function_kwargs=kwargs,\n )\n\n return routing_batch_function\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.route_batch","title":"route_batch(batch, steps) ","text":"Returns a list of selected downstream steps from steps to which the batch should be routed. Parameters: Name Type Description Default batch _Batch The batch that should be routed. required steps List[str] A list of all the downstream steps that can receive the batch. required Returns: Type Description List[str] A list with the names of the steps that should receive the batch. Source code in src/distilabel/pipeline/routing_batch_function.py def route_batch(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n should be routed.\n\n Args:\n batch: The batch that should be routed.\n steps: A list of all the downstream steps that can receive the batch.\n\n Returns:\n A list with the names of the steps that should receive the batch.\n \"\"\"\n routed_steps = self.routing_function(steps)\n self._register_routed_batch(batch, routed_steps)\n return routed_steps\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.set_factory_function","title":"set_factory_function(factory_function_module, factory_function_name, factory_function_kwargs) ","text":"Sets the factory function that was used to create the routing_batch_function . Parameters: Name Type Description Default factory_function_module str The module name where the factory function is defined. required factory_function_name str The name of the factory function that was used to create the routing_batch_function . required factory_function_kwargs Dict[str, Any] The keyword arguments that were used when calling the factory function. 
required Source code in src/distilabel/pipeline/routing_batch_function.py def set_factory_function(\n self,\n factory_function_module: str,\n factory_function_name: str,\n factory_function_kwargs: Dict[str, Any],\n) -> None:\n \"\"\"Sets the factory function that was used to create the `routing_batch_function`.\n\n Args:\n factory_function_module: The module name where the factory function is defined.\n factory_function_name: The name of the factory function that was used to create\n the `routing_batch_function`.\n factory_function_kwargs: The keyword arguments that were used when calling the\n factory function.\n \"\"\"\n self._factory_function_module = factory_function_module\n self._factory_function_name = factory_function_name\n self._factory_function_kwargs = factory_function_kwargs\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.__call__","title":"__call__(batch, steps) ","text":"Returns a list of selected downstream steps from steps to which the batch should be routed. Parameters: Name Type Description Default batch _Batch The batch that should be routed. required steps List[str] A list of all the downstream steps that can receive the batch. required Returns: Type Description List[str] A list with the names of the steps that should receive the batch. Source code in src/distilabel/pipeline/routing_batch_function.py def __call__(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n should be routed.\n\n Args:\n batch: The batch that should be routed.\n steps: A list of all the downstream steps that can receive the batch.\n\n Returns:\n A list with the names of the steps that should receive the batch.\n \"\"\"\n return self.route_batch(batch, steps)\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.__rshift__","title":"__rshift__(other) ","text":"Connects a list of dowstream steps to the upstream step of the routing batch function. Parameters: Name Type Description Default other List[DownstreamConnectableSteps] A list of downstream steps that should be connected to the upstream step of the routing batch function. required Returns: Type Description List[DownstreamConnectableSteps] The list of downstream steps that have been connected to the upstream step of the List[DownstreamConnectableSteps] routing batch function. Source code in src/distilabel/pipeline/routing_batch_function.py def __rshift__(\n self, other: List[\"DownstreamConnectableSteps\"]\n) -> List[\"DownstreamConnectableSteps\"]:\n \"\"\"Connects a list of dowstream steps to the upstream step of the routing batch\n function.\n\n Args:\n other: A list of downstream steps that should be connected to the upstream step\n of the routing batch function.\n\n Returns:\n The list of downstream steps that have been connected to the upstream step of the\n routing batch function.\n \"\"\"\n if not isinstance(other, list):\n raise DistilabelUserError(\n f\"Can only set a `routing_batch_function` for a list of steps. Got: {other}.\"\n \" Please, review the right-hand side of the `routing_batch_function >> other`\"\n \" expression. 
It should be\"\n \" `upstream_step >> routing_batch_function >> [downstream_step_1, dowstream_step_2, ...]`.\",\n page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n )\n\n if not self._step:\n raise DistilabelUserError(\n \"Routing batch function doesn't have an upstream step. Cannot connect downstream\"\n \" steps before connecting the upstream step. Connect this routing batch\"\n \" function to an upstream step using the `>>` operator. For example:\"\n \" `upstream_step >> routing_batch_function >> [downstream_step_1, downstream_step_2, ...]`.\",\n page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n )\n\n for step in other:\n self._step.connect(step)\n return other\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.dump","title":"dump(**kwargs) ","text":"Dumps the routing batch function to a dictionary, and the information of the factory function used to create this routing batch function. Parameters: Name Type Description Default **kwargs Any Additional keyword arguments that should be included in the dump. {} Returns: Type Description Dict[str, Any] A dictionary with the routing batch function information and the factory function Dict[str, Any] information. Source code in src/distilabel/pipeline/routing_batch_function.py def dump(self, **kwargs: Any) -> Dict[str, Any]:\n \"\"\"Dumps the routing batch function to a dictionary, and the information of the\n factory function used to create this routing batch function.\n\n Args:\n **kwargs: Additional keyword arguments that should be included in the dump.\n\n Returns:\n A dictionary with the routing batch function information and the factory function\n information.\n \"\"\"\n dump_info: Dict[str, Any] = {\"step\": self._step.name} # type: ignore\n\n if self.description:\n dump_info[\"description\"] = self.description\n\n if type_info := self._get_type_info():\n dump_info[TYPE_INFO_KEY] = type_info\n\n return dump_info\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.from_dict","title":"from_dict(data) classmethod ","text":"Loads a routing batch function from a dictionary. It must contain the information of the factory function used to create the routing batch function. Parameters: Name Type Description Default data Dict[str, Any] A dictionary with the routing batch function information and the factory function information. required Source code in src/distilabel/pipeline/routing_batch_function.py @classmethod\ndef from_dict(cls, data: Dict[str, Any]) -> Self:\n \"\"\"Loads a routing batch function from a dictionary. It must contain the information\n of the factory function used to create the routing batch function.\n\n Args:\n data: A dictionary with the routing batch function information and the factory\n function information.\n \"\"\"\n type_info = data.get(TYPE_INFO_KEY)\n if not type_info:\n step = data.get(\"step\")\n raise ValueError(\n f\"The routing batch function for step '{step}' was created without a factory\"\n \" function, and it cannot be reconstructed.\"\n )\n\n module = type_info.get(\"module\")\n name = type_info.get(\"name\")\n kwargs = type_info.get(\"kwargs\")\n\n if not module or not name or not kwargs:\n raise ValueError(\n \"The routing batch function was created with a factory function, but the\"\n \" information is incomplete. 
Cannot reconstruct the routing batch function.\"\n )\n\n routing_batch_function = _get_module_attr(module=module, name=name)(**kwargs)\n routing_batch_function.description = data.get(\"description\")\n routing_batch_function.set_factory_function(\n factory_function_module=module,\n factory_function_name=name,\n factory_function_kwargs=kwargs,\n )\n\n return routing_batch_function\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.routing_batch_function","title":"routing_batch_function(description=None) ","text":"Creates a routing batch function that can be used to route batches from one upstream step to specific downstream steps. Parameters: Name Type Description Default description Optional[str] An optional description for the routing batch function. None Returns: Type Description Callable[[RoutingBatchFunc], RoutingBatchFunction] A RoutingBatchFunction instance that can be used with the >> operators and with Callable[[RoutingBatchFunc], RoutingBatchFunction] the Pipeline.connect method when defining the pipeline. Example: from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import LoadDataFromHub, GroupColumns\n\n\n@routing_batch_function\ndef random_routing_batch(steps: List[str]) -> List[str]:\n return random.sample(steps, 2)\n\n\nwith Pipeline(name=\"routing-batch-function\") as pipeline:\n load_data = LoadDataFromHub()\n\n generations = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n generations.append(task)\n\n combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n load_data >> random_routing_batch >> generations >> combine_columns\n Source code in src/distilabel/pipeline/routing_batch_function.py def routing_batch_function(\n description: Optional[str] = None,\n) -> Callable[[RoutingBatchFunc], RoutingBatchFunction]:\n \"\"\"Creates a routing batch function that can be used to route batches from one upstream\n step to specific downstream steps.\n\n Args:\n description: An optional description for the routing batch function.\n\n Returns:\n A `RoutingBatchFunction` instance that can be used with the `>>` operators and with\n the `Pipeline.connect` method when defining the pipeline.\n\n Example:\n\n ```python\n from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\n from distilabel.pipeline import Pipeline, routing_batch_function\n from distilabel.steps import LoadDataFromHub, GroupColumns\n\n\n @routing_batch_function\n def random_routing_batch(steps: List[str]) -> List[str]:\n return random.sample(steps, 2)\n\n\n with Pipeline(name=\"routing-batch-function\") as pipeline:\n load_data = LoadDataFromHub()\n\n generations = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n generations.append(task)\n\n combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n load_data >> random_routing_batch >> generations >> combine_columns\n ```\n \"\"\"\n\n def decorator(func: RoutingBatchFunc) -> RoutingBatchFunction:\n factory_function_name, factory_function_module, factory_function_kwargs = (\n None,\n None,\n None,\n )\n\n # Check if 
`routing_batch_function` was created using a factory function from an installed package\n stack = inspect.stack()\n if len(stack) > 2:\n factory_function_frame_info = stack[1]\n\n # Function factory path\n if factory_function_frame_info.function != \"<module>\":\n factory_function_name = factory_function_frame_info.function\n factory_function_module = inspect.getmodule(\n factory_function_frame_info.frame\n ).__name__ # type: ignore\n\n # Function factory kwargs\n factory_function_kwargs = factory_function_frame_info.frame.f_locals\n\n routing_batch_function = RoutingBatchFunction(\n routing_function=func,\n description=description,\n )\n\n if (\n factory_function_module\n and factory_function_name\n and factory_function_kwargs\n ):\n routing_batch_function.set_factory_function(\n factory_function_module=factory_function_module,\n factory_function_name=factory_function_name,\n factory_function_kwargs=factory_function_kwargs,\n )\n\n return routing_batch_function\n\n return decorator\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.sample_n_steps","title":"sample_n_steps(n) ","text":"A simple function that creates a routing batch function that samples n steps from the list of all the downstream steps. Parameters: Name Type Description Default n int The number of steps to sample from the list of all the downstream steps. required Returns: Type Description RoutingBatchFunction A RoutingBatchFunction instance that can be used with the >> operators and with RoutingBatchFunction the Pipeline.connect method when defining the pipeline. Example: from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, sample_n_steps\nfrom distilabel.steps import LoadDataFromHub, GroupColumns\n\nrandom_routing_batch = sample_n_steps(2)\n\n\nwith Pipeline(name=\"routing-batch-function\") as pipeline:\n load_data = LoadDataFromHub()\n\n generations = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n generations.append(task)\n\n combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n load_data >> random_routing_batch >> generations >> combine_columns\n Source code in src/distilabel/pipeline/routing_batch_function.py def sample_n_steps(n: int) -> RoutingBatchFunction:\n \"\"\"A simple function that creates a routing batch function that samples `n` steps from\n the list of all the downstream steps.\n\n Args:\n n: The number of steps to sample from the list of all the downstream steps.\n\n Returns:\n A `RoutingBatchFunction` instance that can be used with the `>>` operators and with\n the `Pipeline.connect` method when defining the pipeline.\n\n Example:\n\n ```python\n from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\n from distilabel.pipeline import Pipeline, sample_n_steps\n from distilabel.steps import LoadDataFromHub, GroupColumns\n\n random_routing_batch = sample_n_steps(2)\n\n\n with Pipeline(name=\"routing-batch-function\") as pipeline:\n load_data = LoadDataFromHub()\n\n generations = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n generations.append(task)\n\n combine_columns = 
GroupColumns(columns=[\"generation\", \"model_name\"])\n\n load_data >> random_routing_batch >> generations >> combine_columns\n ```\n \"\"\"\n\n @routing_batch_function(\n description=f\"Sample {n} steps from the list of downstream steps.\"\n )\n def sample_n(steps: List[str]) -> List[str]:\n return random.sample(steps, n)\n\n return sample_n\n "},{"location":"api/pipeline/step_wrapper/","title":"Step Wrapper","text":""},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper","title":"_StepWrapper ","text":"Wrapper to run the Step . Attributes: Name Type Description step The step to run. replica The replica ID assigned. input_queue The queue to receive the input data. output_queue The queue to send the output data. load_queue The queue used to notify the main process that the step has been loaded, has been unloaded or has failed to load. Source code in src/distilabel/pipeline/step_wrapper.py class _StepWrapper:\n \"\"\"Wrapper to run the `Step`.\n\n Attributes:\n step: The step to run.\n replica: The replica ID assigned.\n input_queue: The queue to receive the input data.\n output_queue: The queue to send the output data.\n load_queue: The queue used to notify the main process that the step has been loaded,\n has been unloaded or has failed to load.\n \"\"\"\n\n def __init__(\n self,\n step: Union[\"Step\", \"GeneratorStep\"],\n replica: int,\n input_queue: \"Queue[_Batch]\",\n output_queue: \"Queue[_Batch]\",\n load_queue: \"Queue[Union[StepLoadStatus, None]]\",\n dry_run: bool = False,\n ray_pipeline: bool = False,\n ) -> None:\n \"\"\"Initializes the `_ProcessWrapper`.\n\n Args:\n step: The step to run.\n input_queue: The queue to receive the input data.\n output_queue: The queue to send the output data.\n load_queue: The queue used to notify the main process that the step has been\n loaded, has been unloaded or has failed to load.\n dry_run: Flag to ensure we are forcing to run the last batch.\n ray_pipeline: Whether the step is running a `RayPipeline` or not.\n \"\"\"\n self.step = step\n self.replica = replica\n self.input_queue = input_queue\n self.output_queue = output_queue\n self.load_queue = load_queue\n self.dry_run = dry_run\n self.ray_pipeline = ray_pipeline\n\n self._init_cuda_device_placement()\n\n def _init_cuda_device_placement(self) -> None:\n \"\"\"Sets the LLM identifier and the number of desired GPUs of the `CudaDevicePlacementMixin`\"\"\"\n\n def _init_cuda_device_placement_mixin(attr: CudaDevicePlacementMixin) -> None:\n if self.ray_pipeline:\n attr.disable_cuda_device_placement = True\n else:\n desired_num_gpus = self.step.resources.gpus or 1\n attr._llm_identifier = f\"{self.step.name}-replica-{self.replica}\"\n attr._desired_num_gpus = desired_num_gpus\n\n for field_name in self.step.model_fields_set:\n attr = getattr(self.step, field_name)\n if isinstance(attr, CudaDevicePlacementMixin):\n _init_cuda_device_placement_mixin(attr)\n\n if isinstance(self.step, CudaDevicePlacementMixin):\n _init_cuda_device_placement_mixin(self.step)\n\n def run(self) -> str:\n \"\"\"The target function executed by the process. 
This function will also handle\n the step lifecycle, executing first the `load` function of the `Step` and then\n waiting to receive a batch from the `input_queue` that will be handled by the\n `process` method of the `Step`.\n\n Returns:\n The name of the step that was executed.\n \"\"\"\n\n try:\n self.step.load()\n self.step._logger.debug(f\"Step '{self.step.name}' loaded!\")\n except Exception as e:\n self.step.unload()\n self._notify_load_failed()\n raise _StepWrapperException.create_load_error(\n message=f\"Step load failed: {e}\",\n step=self.step,\n subprocess_exception=e,\n ) from e\n\n self._notify_load()\n\n if self.step.is_generator:\n self._generator_step_process_loop()\n else:\n self._non_generator_process_loop()\n\n # Just in case `None` sentinel was sent\n try:\n self.input_queue.get(block=False)\n except Exception:\n pass\n\n self.step.unload()\n\n self._notify_unload()\n\n self.step._logger.info(\n f\"\ud83c\udfc1 Finished running step '{self.step.name}' (replica ID: {self.replica})\"\n )\n\n return self.step.name # type: ignore\n\n def _notify_load(self) -> None:\n \"\"\"Notifies that the step has finished executing its `load` function successfully.\"\"\"\n self.step._logger.debug(\n f\"Notifying load of step '{self.step.name}' (replica ID {self.replica})...\"\n )\n self.load_queue.put({\"name\": self.step.name, \"status\": \"loaded\"}) # type: ignore\n\n def _notify_unload(self) -> None:\n \"\"\"Notifies that the step has been unloaded.\"\"\"\n self.step._logger.debug(\n f\"Notifying unload of step '{self.step.name}' (replica ID {self.replica})...\"\n )\n self.load_queue.put({\"name\": self.step.name, \"status\": \"unloaded\"}) # type: ignore\n\n def _notify_load_failed(self) -> None:\n \"\"\"Notifies that the step failed to load.\"\"\"\n self.step._logger.debug(\n f\"Notifying load failed of step '{self.step.name}' (replica ID {self.replica})...\"\n )\n self.load_queue.put({\"name\": self.step.name, \"status\": \"load_failed\"}) # type: ignore\n\n def _generator_step_process_loop(self) -> None:\n \"\"\"Runs the process loop for a generator step. It will call the `process` method\n of the step and send the output data to the `output_queue` and block until the next\n batch request is received (i.e. 
receiving an empty batch from the `input_queue`).\n\n If the `last_batch` attribute of the batch is `True`, the loop will stop and the\n process will finish.\n\n Raises:\n _StepWrapperException: If an error occurs during the execution of the\n `process` method.\n \"\"\"\n step = cast(\"GeneratorStep\", self.step)\n\n try:\n if (batch := self.input_queue.get()) is None:\n self.step._logger.info(\n f\"\ud83d\uded1 Stopping yielding batches from step '{self.step.name}'\"\n )\n return\n\n offset = batch.seq_no * step.batch_size # type: ignore\n\n self.step._logger.info(\n f\"\ud83d\udeb0 Starting yielding batches from generator step '{self.step.name}'.\"\n f\" Offset: {offset}\"\n )\n\n for data, last_batch in step.process_applying_mappings(offset=offset):\n batch.set_data([data])\n batch.last_batch = self.dry_run or last_batch\n self._send_batch(batch)\n\n if batch.last_batch:\n return\n\n self.step._logger.debug(\n f\"Step '{self.step.name}' waiting for next batch request...\"\n )\n if (batch := self.input_queue.get()) is None:\n self.step._logger.info(\n f\"\ud83d\uded1 Stopping yielding batches from step '{self.step.name}'\"\n )\n return\n except Exception as e:\n raise _StepWrapperException(str(e), self.step, 2, e) from e\n\n def _non_generator_process_loop(self) -> None:\n \"\"\"Runs the process loop for a non-generator step. It will call the `process`\n method of the step and send the output data to the `output_queue` and block until\n the next batch is received from the `input_queue`. If the `last_batch` attribute\n of the batch is `True`, the loop will stop and the process will finish.\n\n If an error occurs during the execution of the `process` method and the step is\n global, the process will raise a `_StepWrapperException`. If the step is not\n global, the process will log the error and send an empty batch to the `output_queue`.\n\n Raises:\n _StepWrapperException: If an error occurs during the execution of the\n `process` method and the step is global.\n \"\"\"\n step = cast(\"Step\", self.step)\n while True:\n if (batch := self.input_queue.get()) is None:\n self.step._logger.info(\n f\"\ud83d\uded1 Stopping processing batches from step '{self.step.name}'\"\n )\n break\n\n if batch == LAST_BATCH_SENT_FLAG:\n self.step._logger.debug(\"Received `LAST_BATCH_SENT_FLAG`. 
Stopping...\")\n break\n\n self.step._logger.info(\n f\"\ud83d\udce6 Processing batch {batch.seq_no} in '{batch.step_name}' (replica ID: {self.replica})\"\n )\n\n if batch.data_path is not None:\n self.step._logger.debug(f\"Reading batch data from '{batch.data_path}'\")\n batch.read_batch_data_from_fs()\n\n result = []\n try:\n if self.step.has_multiple_inputs:\n result = next(step.process_applying_mappings(*batch.data))\n else:\n result = next(step.process_applying_mappings(batch.data[0]))\n except Exception as e:\n if self.step.is_global:\n self.step.unload()\n self._notify_unload()\n data = (\n batch.data\n if isinstance(\n e, DistilabelOfflineBatchGenerationNotFinishedException\n )\n else None\n )\n raise _StepWrapperException(str(e), self.step, 2, e, data) from e\n\n # Impute step outputs columns with `None`\n result = self._impute_step_outputs(batch)\n\n # if the step is not global then we can skip the batch which means sending\n # an empty batch to the output queue\n self.step._logger.warning(\n f\"\u26a0\ufe0f Processing batch {batch.seq_no} with step '{self.step.name}' failed.\"\n \" Sending empty batch filled with `None`s...\"\n )\n self.step._logger.warning(\n f\"Subprocess traceback:\\n\\n{traceback.format_exc()}\"\n )\n finally:\n batch.set_data([result])\n self._send_batch(batch)\n\n if batch.last_batch:\n break\n\n def _impute_step_outputs(self, batch: \"_Batch\") -> List[Dict[str, Any]]:\n \"\"\"Imputes the step outputs columns with `None` in the batch data.\n\n Args:\n batch: The batch to impute.\n \"\"\"\n return self.step.impute_step_outputs(batch.data[0])\n\n def _send_batch(self, batch: _Batch) -> None:\n \"\"\"Sends a batch to the `output_queue`.\"\"\"\n if batch.data_path is not None:\n self.step._logger.debug(f\"Writing batch data to '{batch.data_path}'\")\n batch.write_batch_data_to_fs()\n\n self.step._logger.info(\n f\"\ud83d\udce8 Step '{batch.step_name}' sending batch {batch.seq_no} to output queue\"\n )\n self.output_queue.put(batch)\n "},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper.__init__","title":"__init__(step, replica, input_queue, output_queue, load_queue, dry_run=False, ray_pipeline=False) ","text":"Initializes the _ProcessWrapper . Parameters: Name Type Description Default step Union[Step, GeneratorStep] The step to run. required input_queue Queue[_Batch] The queue to receive the input data. required output_queue Queue[_Batch] The queue to send the output data. required load_queue Queue[Union[StepLoadStatus, None]] The queue used to notify the main process that the step has been loaded, has been unloaded or has failed to load. required dry_run bool Flag to ensure we are forcing to run the last batch. False ray_pipeline bool Whether the step is running a RayPipeline or not. 
False Source code in src/distilabel/pipeline/step_wrapper.py def __init__(\n self,\n step: Union[\"Step\", \"GeneratorStep\"],\n replica: int,\n input_queue: \"Queue[_Batch]\",\n output_queue: \"Queue[_Batch]\",\n load_queue: \"Queue[Union[StepLoadStatus, None]]\",\n dry_run: bool = False,\n ray_pipeline: bool = False,\n) -> None:\n \"\"\"Initializes the `_ProcessWrapper`.\n\n Args:\n step: The step to run.\n input_queue: The queue to receive the input data.\n output_queue: The queue to send the output data.\n load_queue: The queue used to notify the main process that the step has been\n loaded, has been unloaded or has failed to load.\n dry_run: Flag to ensure we are forcing to run the last batch.\n ray_pipeline: Whether the step is running a `RayPipeline` or not.\n \"\"\"\n self.step = step\n self.replica = replica\n self.input_queue = input_queue\n self.output_queue = output_queue\n self.load_queue = load_queue\n self.dry_run = dry_run\n self.ray_pipeline = ray_pipeline\n\n self._init_cuda_device_placement()\n "},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper.run","title":"run() ","text":"The target function executed by the process. This function will also handle the step lifecycle, executing first the load function of the Step and then waiting to receive a batch from the input_queue that will be handled by the process method of the Step . Returns: Type Description str The name of the step that was executed. Source code in src/distilabel/pipeline/step_wrapper.py def run(self) -> str:\n \"\"\"The target function executed by the process. This function will also handle\n the step lifecycle, executing first the `load` function of the `Step` and then\n waiting to receive a batch from the `input_queue` that will be handled by the\n `process` method of the `Step`.\n\n Returns:\n The name of the step that was executed.\n \"\"\"\n\n try:\n self.step.load()\n self.step._logger.debug(f\"Step '{self.step.name}' loaded!\")\n except Exception as e:\n self.step.unload()\n self._notify_load_failed()\n raise _StepWrapperException.create_load_error(\n message=f\"Step load failed: {e}\",\n step=self.step,\n subprocess_exception=e,\n ) from e\n\n self._notify_load()\n\n if self.step.is_generator:\n self._generator_step_process_loop()\n else:\n self._non_generator_process_loop()\n\n # Just in case `None` sentinel was sent\n try:\n self.input_queue.get(block=False)\n except Exception:\n pass\n\n self.step.unload()\n\n self._notify_unload()\n\n self.step._logger.info(\n f\"\ud83c\udfc1 Finished running step '{self.step.name}' (replica ID: {self.replica})\"\n )\n\n return self.step.name # type: ignore\n "},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException","title":"_StepWrapperException ","text":" Bases: Exception Exception to be raised when an error occurs in the _StepWrapper class. Attributes: Name Type Description message The error message. step The Step that raised the error. code The error code. subprocess_exception The exception raised by the subprocess. data The data that caused the error. Defaults to None . Source code in src/distilabel/pipeline/step_wrapper.py class _StepWrapperException(Exception):\n \"\"\"Exception to be raised when an error occurs in the `_StepWrapper` class.\n\n Attributes:\n message: The error message.\n step: The `Step` that raised the error.\n code: The error code.\n subprocess_exception: The exception raised by the subprocess.\n data: The data that caused the error. 
Defaults to `None`.\n \"\"\"\n\n def __init__(\n self,\n message: str,\n step: \"_Step\",\n code: int,\n subprocess_exception: Exception,\n data: Optional[List[List[Dict[str, Any]]]] = None,\n ) -> None:\n self.message = f\"{message}\\n\\nFor further information visit '{DISTILABEL_DOCS_URL}api/pipeline/step_wrapper'\"\n self.step = step\n self.code = code\n self.subprocess_exception = subprocess_exception\n self.formatted_traceback = \"\".join(\n traceback.format_exception(\n type(subprocess_exception),\n subprocess_exception,\n subprocess_exception.__traceback__,\n )\n )\n self.data = data\n\n @classmethod\n def create_load_error(\n cls,\n message: str,\n step: \"_Step\",\n subprocess_exception: Optional[Exception] = None,\n ) -> \"_StepWrapperException\":\n \"\"\"Creates a `_StepWrapperException` for a load error.\n\n Args:\n message: The error message.\n step: The `Step` that raised the error.\n subprocess_exception: The exception raised by the subprocess. Defaults to `None`.\n\n Returns:\n The `_StepWrapperException` instance.\n \"\"\"\n return cls(message, step, 1, subprocess_exception, None)\n\n @property\n def is_load_error(self) -> bool:\n \"\"\"Whether the error is a load error.\n\n Returns:\n `True` if the error is a load error, `False` otherwise.\n \"\"\"\n return self.code == 1\n "},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException.is_load_error","title":"is_load_error: bool property ","text":"Whether the error is a load error. Returns: Type Description bool True if the error is a load error, False otherwise. "},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException.create_load_error","title":"create_load_error(message, step, subprocess_exception=None) classmethod ","text":"Creates a _StepWrapperException for a load error. Parameters: Name Type Description Default message str The error message. required step _Step The Step that raised the error. required subprocess_exception Optional[Exception] The exception raised by the subprocess. Defaults to None . None Returns: Type Description _StepWrapperException The _StepWrapperException instance. Source code in src/distilabel/pipeline/step_wrapper.py @classmethod\ndef create_load_error(\n cls,\n message: str,\n step: \"_Step\",\n subprocess_exception: Optional[Exception] = None,\n) -> \"_StepWrapperException\":\n \"\"\"Creates a `_StepWrapperException` for a load error.\n\n Args:\n message: The error message.\n step: The `Step` that raised the error.\n subprocess_exception: The exception raised by the subprocess. Defaults to `None`.\n\n Returns:\n The `_StepWrapperException` instance.\n \"\"\"\n return cls(message, step, 1, subprocess_exception, None)\n "},{"location":"api/pipeline/typing/","title":"Pipeline Typing","text":""},{"location":"api/pipeline/typing/#distilabel.pipeline.typing","title":"typing ","text":""},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.DownstreamConnectable","title":"DownstreamConnectable = Union['Step', 'GlobalStep'] module-attribute ","text":"Alias for the Step types that can be connected as downstream steps. "},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.UpstreamConnectableSteps","title":"UpstreamConnectableSteps = TypeVar('UpstreamConnectableSteps', bound=Union['Step', 'GlobalStep', 'GeneratorStep']) module-attribute ","text":"Type for the Step types that can be connected as upstream steps. 
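A minimal sketch of what connecting such upstream steps looks like in practice (assuming step_a, step_b and step_c are steps created inside a Pipeline context; the names are illustrative, not from the source): [step_a, step_b] >> step_c\n# equivalent to:\nstep_a.connect(step_c)\nstep_b.connect(step_c) 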
"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.DownstreamConnectableSteps","title":"DownstreamConnectableSteps = TypeVar('DownstreamConnectableSteps', bound=DownstreamConnectable, covariant=True) module-attribute ","text":"Type for the Step types that can be connected as downstream steps. "},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.PipelineRuntimeParametersInfo","title":"PipelineRuntimeParametersInfo = Dict[str, Union[List['RuntimeParameterInfo'], Dict[str, 'RuntimeParameterInfo']]] module-attribute ","text":"Alias for the information of the runtime parameters of a Pipeline . "},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.InputDataset","title":"InputDataset = Union['Dataset', 'pd.DataFrame', List[Dict[str, str]]] module-attribute ","text":"Alias for the types we can process as input dataset. "},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.LoadGroups","title":"LoadGroups = Union[List[List[Any]], Literal['sequential_step_execution']] module-attribute ","text":"Alias for the types that can be used as load groups. - if
List[List[Any]] , it's a list containing lists of steps that have to be loaded in isolation. - if \"sequential_step_execution\", each step will be loaded in a different stage i.e. only one step will be executed at a time.
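A minimal sketch of how a load group value can be passed to run (assuming a pipeline with two steps step_1 and step_2 has already been defined; these names are illustrative): distiset = pipeline.run(load_groups=[[step_1.name], [step_2.name]])\n# or execute one step at a time\ndistiset = pipeline.run(load_groups=\"sequential_step_execution\") 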
"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.StepLoadStatus","title":"StepLoadStatus ","text":" Bases: TypedDict Dict containing information about if one step was loaded/unloaded or if it's load failed Source code in src/distilabel/pipeline/typing.py class StepLoadStatus(TypedDict):\n \"\"\"Dict containing information about if one step was loaded/unloaded or if it's load\n failed\"\"\"\n\n name: str\n status: Literal[\"loaded\", \"unloaded\", \"load_failed\"]\n "},{"location":"api/step/","title":"Step","text":"This section contains the API reference for the distilabel step, both for the _Step base class and the Step class. For more information and examples on how to use existing steps or create custom ones, please refer to Tutorial - Step. "},{"location":"api/step/#distilabel.steps.base","title":"base ","text":""},{"location":"api/step/#distilabel.steps.base.StepInput","title":"StepInput = Annotated[List[Dict[str, Any]], _STEP_INPUT_ANNOTATION] module-attribute ","text":"StepInput is just an Annotated alias of the typing List[Dict[str, Any]] with extra metadata that allows distilabel to perform validations over the process step method defined in each Step "},{"location":"api/step/#distilabel.steps.base._Step","title":"_Step ","text":" Bases: RuntimeParametersMixin , RequirementsMixin , SignatureMixin , BaseModel , _Serializable , ABC Base class for the steps that can be included in a Pipeline . A Step is a class defining some processing logic. The input and outputs for this processing logic are lists of dictionaries with the same keys: ```python\n[\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n]\n```\n The processing logic is defined in the process method, which depending on the number of previous steps, can receive more than one list of dictionaries, each with the output of the previous steps. In order to make distilabel know where the outputs from the previous steps are, the process function from each Step must have an argument or positional argument annotated with StepInput . ```python\nclass StepWithOnePreviousStep(Step):\n def process(self, inputs: StepInput) -> StepOutput:\n yield [...]\n\nclass StepWithSeveralPreviousStep(Step):\n # mind the * to indicate that the argument is a list of StepInput\n def process(self, *inputs: StepInput) -> StepOutput:\n yield [...]\n```\n In order to perform static validations and to check that the chaining of the steps in the pipeline is valid, a Step must also define the inputs and outputs properties: inputs : a list of strings with the names of the columns that the step needs as input. It can be an empty list if the step is a generator step. outputs : a list of strings with the names of the columns that the step will produce as output. Optionally, a Step can override the load method to perform any initialization logic before the process method is called. For example, to load an LLM, stablish a connection to a database, etc. Finally, the Step class inherits from pydantic.BaseModel , so attributes can be easily defined, validated, serialized and included in the __init__ method of the step. Source code in src/distilabel/steps/base.py class _Step(\n RuntimeParametersMixin,\n RequirementsMixin,\n SignatureMixin,\n BaseModel,\n _Serializable,\n ABC,\n):\n \"\"\"Base class for the steps that can be included in a `Pipeline`.\n\n A `Step` is a class defining some processing logic. 
The input and outputs for this\n processing logic are lists of dictionaries with the same keys:\n\n ```python\n [\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n ]\n ```\n\n The processing logic is defined in the `process` method, which depending on the\n number of previous steps, can receive more than one list of dictionaries, each with\n the output of the previous steps. In order to make `distilabel` know where the outputs\n from the previous steps are, the `process` function from each `Step` must have an argument\n or positional argument annotated with `StepInput`.\n\n ```python\n class StepWithOnePreviousStep(Step):\n def process(self, inputs: StepInput) -> StepOutput:\n yield [...]\n\n class StepWithSeveralPreviousStep(Step):\n # mind the * to indicate that the argument is a list of StepInput\n def process(self, *inputs: StepInput) -> StepOutput:\n yield [...]\n ```\n\n In order to perform static validations and to check that the chaining of the steps\n in the pipeline is valid, a `Step` must also define the `inputs` and `outputs`\n properties:\n\n - `inputs`: a list of strings with the names of the columns that the step needs as\n input. It can be an empty list if the step is a generator step.\n - `outputs`: a list of strings with the names of the columns that the step will\n produce as output.\n\n Optionally, a `Step` can override the `load` method to perform any initialization\n logic before the `process` method is called. For example, to load an LLM, stablish a\n connection to a database, etc.\n\n Finally, the `Step` class inherits from `pydantic.BaseModel`, so attributes can be easily\n defined, validated, serialized and included in the `__init__` method of the step.\n \"\"\"\n\n model_config = ConfigDict(\n arbitrary_types_allowed=True,\n validate_default=True,\n validate_assignment=True,\n extra=\"forbid\",\n )\n\n name: Optional[str] = Field(default=None, pattern=r\"^[a-zA-Z0-9_-]+$\")\n resources: StepResources = StepResources()\n pipeline: Any = Field(default=None, exclude=True, repr=False)\n input_mappings: Dict[str, str] = {}\n output_mappings: Dict[str, str] = {}\n use_cache: bool = True\n\n _pipeline_artifacts_path: Path = PrivateAttr(None)\n _built_from_decorator: bool = PrivateAttr(default=False)\n _logger: \"Logger\" = PrivateAttr(None)\n\n def model_post_init(self, __context: Any) -> None:\n from distilabel.pipeline.base import _GlobalPipelineManager\n\n super().model_post_init(__context)\n\n if self.pipeline is None:\n self.pipeline = _GlobalPipelineManager.get_pipeline()\n\n if self.pipeline is None:\n _logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n _logger.warning(\n f\"Step '{self.name}' hasn't received a pipeline, and it hasn't been\"\n \" created within a `Pipeline` context. 
Please, use\"\n \" `with Pipeline() as pipeline:` and create the step within the context.\"\n )\n\n if not self.name:\n # This must be done before the check for repeated names, but assuming\n # we are passing the pipeline from the _GlobalPipelineManager, should\n # be done after that.\n self.name = _infer_step_name(type(self).__name__, self.pipeline)\n\n if self.pipeline is not None:\n # If not set an error will be raised in `Pipeline.run` parent\n self.pipeline._add_step(self)\n\n def connect(\n self,\n *steps: \"_Step\",\n routing_batch_function: Optional[\"RoutingBatchFunction\"] = None,\n ) -> None:\n \"\"\"Connects the current step to another step in the pipeline, which means that\n the output of this step will be the input of the other step.\n\n Args:\n steps: The steps to connect to the current step.\n routing_batch_function: A function that receives a list of steps and returns\n a list of steps to which the output batch generated by this step should be\n routed. It should be used to define the routing logic of the pipeline. If\n not provided, the output batch will be routed to all the connected steps.\n Defaults to `None`.\n \"\"\"\n assert self.pipeline is not None\n\n if routing_batch_function:\n self._set_routing_batch_function(routing_batch_function)\n\n for step in steps:\n self.pipeline._add_edge(from_step=self.name, to_step=step.name) # type: ignore\n\n def _set_routing_batch_function(\n self, routing_batch_function: \"RoutingBatchFunction\"\n ) -> None:\n \"\"\"Sets a routing batch function for the batches generated by this step, so they\n get routed to specific downstream steps.\n\n Args:\n routing_batch_function: The routing batch function that will be used to route\n the batches generated by this step.\n \"\"\"\n self.pipeline._add_routing_batch_function(\n step_name=self.name, # type: ignore\n routing_batch_function=routing_batch_function,\n )\n routing_batch_function._step = self\n\n @overload\n def __rshift__(self, other: \"RoutingBatchFunction\") -> \"RoutingBatchFunction\": ...\n\n @overload\n def __rshift__(\n self, other: List[\"DownstreamConnectableSteps\"]\n ) -> List[\"DownstreamConnectableSteps\"]: ...\n\n @overload\n def __rshift__(self, other: \"DownstreamConnectable\") -> \"DownstreamConnectable\": ...\n\n def __rshift__(\n self,\n other: Union[\n \"DownstreamConnectable\",\n \"RoutingBatchFunction\",\n List[\"DownstreamConnectableSteps\"],\n ],\n ) -> Union[\n \"DownstreamConnectable\",\n \"RoutingBatchFunction\",\n List[\"DownstreamConnectableSteps\"],\n ]:\n \"\"\"Allows using the `>>` operator to connect steps in the pipeline.\n\n Args:\n other: The step to connect, a list of steps to connect to or a routing batch\n function to be set for the step.\n\n Returns:\n The connected step, the list of connected steps or the routing batch function.\n\n Example:\n ```python\n step1 >> step2\n # Would be equivalent to:\n step1.connect(step2)\n\n # It also allows to connect a list of steps\n step1 >> [step2, step3]\n ```\n \"\"\"\n # Here to avoid circular imports\n from distilabel.pipeline.routing_batch_function import RoutingBatchFunction\n\n if isinstance(other, list):\n self.connect(*other)\n return other\n\n if isinstance(other, RoutingBatchFunction):\n self._set_routing_batch_function(other)\n return other\n\n self.connect(other)\n return other\n\n def __rrshift__(self, other: List[\"UpstreamConnectableSteps\"]) -> Self:\n \"\"\"Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline\n to a single step, as the list 
doesn't have the __rshift__ operator.\n\n Args:\n other: The step to connect to.\n\n Returns:\n The connected step\n\n Example:\n ```python\n [step2, step3] >> step1\n # Would be equivalent to:\n step2.connect(step1)\n step3.connect(step1)\n ```\n \"\"\"\n for o in other:\n o.connect(self)\n return self\n\n def load(self) -> None:\n \"\"\"Method to perform any initialization logic before the `process` method is\n called. For example, to load an LLM, stablish a connection to a database, etc.\n \"\"\"\n self._logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n\n def unload(self) -> None:\n \"\"\"Method to perform any cleanup logic after the `process` method is called. For\n example, to close a connection to a database, etc.\n \"\"\"\n self._logger.debug(\"Executing step unload logic.\")\n\n @property\n def is_generator(self) -> bool:\n \"\"\"Whether the step is a generator step or not.\n\n Returns:\n `True` if the step is a generator step, `False` otherwise.\n \"\"\"\n return isinstance(self, GeneratorStep)\n\n @property\n def is_global(self) -> bool:\n \"\"\"Whether the step is a global step or not.\n\n Returns:\n `True` if the step is a global step, `False` otherwise.\n \"\"\"\n return isinstance(self, GlobalStep)\n\n @property\n def is_normal(self) -> bool:\n \"\"\"Whether the step is a normal step or not.\n\n Returns:\n `True` if the step is a normal step, `False` otherwise.\n \"\"\"\n return not self.is_generator and not self.is_global\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"List of strings with the names of the mandatory columns that the step needs as\n input or dictionary in which the keys are the input columns of the step and the\n values are booleans indicating whether the column is optional or not.\n\n Returns:\n List of strings with the names of the columns that the step needs as input.\n \"\"\"\n return []\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"List of strings with the names of the columns that the step will produce as\n output or dictionary in which the keys are the output columns of the step and the\n values are booleans indicating whether the column is optional or not.\n\n Returns:\n List of strings with the names of the columns that the step will produce as\n output.\n \"\"\"\n return []\n\n @cached_property\n def process_parameters(self) -> List[inspect.Parameter]:\n \"\"\"Returns the parameters of the `process` method of the step.\n\n Returns:\n The parameters of the `process` method of the step.\n \"\"\"\n return list(inspect.signature(self.process).parameters.values()) # type: ignore\n\n def has_multiple_inputs(self) -> bool:\n \"\"\"Whether the `process` method of the step receives more than one input or not\n i.e. 
has a `*` argument annotated with `StepInput`.\n\n Returns:\n `True` if the `process` method of the step receives more than one input,\n `False` otherwise.\n \"\"\"\n return any(\n param.kind == param.VAR_POSITIONAL for param in self.process_parameters\n )\n\n def get_process_step_input(self) -> Union[inspect.Parameter, None]:\n \"\"\"Returns the parameter of the `process` method of the step annotated with\n `StepInput`.\n\n Returns:\n The parameter of the `process` method of the step annotated with `StepInput`,\n or `None` if there is no parameter annotated with `StepInput`.\n\n Raises:\n TypeError: If the step has more than one parameter annotated with `StepInput`.\n \"\"\"\n step_input_parameter = None\n for parameter in self.process_parameters:\n if is_parameter_annotated_with(parameter, _STEP_INPUT_ANNOTATION):\n if step_input_parameter is not None:\n raise DistilabelTypeError(\n f\"Step '{self.name}' should have only one parameter with type\"\n \" hint `StepInput`.\",\n page=\"sections/how_to_guides/basic/step/#defining-custom-steps\",\n )\n step_input_parameter = parameter\n return step_input_parameter\n\n def verify_inputs_mappings(self) -> None:\n \"\"\"Verifies that the `inputs_mappings` of the step are valid i.e. the input\n columns exist in the inputs of the step.\n\n Raises:\n ValueError: If the `inputs_mappings` of the step are not valid.\n \"\"\"\n if not self.input_mappings:\n return\n\n for input in self.input_mappings:\n if input not in self.inputs:\n raise DistilabelUserError(\n f\"The input column '{input}' doesn't exist in the inputs of the\"\n f\" step '{self.name}'. Inputs of the step are: {self.inputs}.\"\n \" Please, review the `inputs_mappings` argument of the step.\",\n page=\"sections/how_to_guides/basic/step/#arguments\",\n )\n\n def verify_outputs_mappings(self) -> None:\n \"\"\"Verifies that the `outputs_mappings` of the step are valid i.e. the output\n columns exist in the outputs of the step.\n\n Raises:\n ValueError: If the `outputs_mappings` of the step are not valid.\n \"\"\"\n if not self.output_mappings:\n return\n\n for output in self.output_mappings:\n if output not in self.outputs:\n raise DistilabelUserError(\n f\"The output column '{output}' doesn't exist in the outputs of the\"\n f\" step '{self.name}'. Outputs of the step are: {self.outputs}.\"\n \" Please, review the `outputs_mappings` argument of the step.\",\n page=\"sections/how_to_guides/basic/step/#arguments\",\n )\n\n def get_inputs(self) -> Dict[str, bool]:\n \"\"\"Gets the inputs of the step after the `input_mappings`. This method is meant\n to be used to run validations on the inputs of the step.\n\n Returns:\n The inputs of the step after the `input_mappings` and if they are required or\n not.\n \"\"\"\n if isinstance(self.inputs, list):\n return {\n self.input_mappings.get(input, input): True for input in self.inputs\n }\n\n return {\n self.input_mappings.get(input, input): required\n for input, required in self.inputs.items()\n }\n\n def get_outputs(self) -> Dict[str, bool]:\n \"\"\"Gets the outputs of the step after the `outputs_mappings`. 
This method is\n meant to be used to run validations on the outputs of the step.\n\n Returns:\n The outputs of the step after the `outputs_mappings` and if they are required\n or not.\n \"\"\"\n if isinstance(self.outputs, list):\n return {\n self.output_mappings.get(output, output): True\n for output in self.outputs\n }\n\n return {\n self.output_mappings.get(output, output): required\n for output, required in self.outputs.items()\n }\n\n def set_pipeline_artifacts_path(self, path: Path) -> None:\n \"\"\"Sets the `_pipeline_artifacts_path` attribute. This method is meant to be used\n by the `Pipeline` once the cache location is known.\n\n Args:\n path: the path where the artifacts generated by the pipeline steps should be\n saved.\n \"\"\"\n self._pipeline_artifacts_path = path\n\n @property\n def artifacts_directory(self) -> Union[Path, None]:\n \"\"\"Gets the path of the directory where the step should save its generated artifacts.\n\n Returns:\n The path of the directory where the step should save the generated artifacts,\n or `None` if `_pipeline_artifacts_path` is not set.\n \"\"\"\n if self._pipeline_artifacts_path is None:\n return None\n return self._pipeline_artifacts_path / self.name # type: ignore\n\n def save_artifact(\n self,\n name: str,\n write_function: Callable[[Path], None],\n metadata: Optional[Dict[str, Any]] = None,\n ) -> None:\n \"\"\"Saves an artifact generated by the `Step`.\n\n Args:\n name: the name of the artifact.\n write_function: a function that will receive the path where the artifact should\n be saved.\n metadata: the artifact metadata. Defaults to `None`.\n \"\"\"\n if self.artifacts_directory is None:\n self._logger.warning(\n f\"Cannot save artifact with '{name}' as `_pipeline_artifacts_path` is not\"\n \" set. This is normal if the `Step` is being executed as a standalone component.\"\n )\n return\n\n artifact_directory_path = self.artifacts_directory / name\n artifact_directory_path.mkdir(parents=True, exist_ok=True)\n\n self._logger.info(f\"\ud83c\udffa Storing '{name}' generated artifact...\")\n\n self._logger.debug(\n f\"Calling `write_function` to write artifact in '{artifact_directory_path}'...\"\n )\n write_function(artifact_directory_path)\n\n metadata_path = artifact_directory_path / \"metadata.json\"\n self._logger.debug(\n f\"Calling `write_json` to write artifact metadata in '{metadata_path}'...\"\n )\n write_json(filename=metadata_path, data=metadata or {})\n\n def impute_step_outputs(\n self, step_output: List[Dict[str, Any]]\n ) -> List[Dict[str, Any]]:\n \"\"\"\n Imputes the output columns of the step that are not present in the step output.\n \"\"\"\n result = []\n for row in step_output:\n data = row.copy()\n for output in self.get_outputs().keys():\n data[output] = None\n result.append(data)\n return result\n\n def _model_dump(self, obj: Any, **kwargs: Any) -> Dict[str, Any]:\n dump = super()._model_dump(obj, **kwargs)\n dump[\"runtime_parameters_info\"] = self.get_runtime_parameters_info()\n return dump\n "},{"location":"api/step/#distilabel.steps.base._Step.is_generator","title":"is_generator: bool property ","text":"Whether the step is a generator step or not. Returns: Type Description bool True if the step is a generator step, False otherwise. "},{"location":"api/step/#distilabel.steps.base._Step.is_global","title":"is_global: bool property ","text":"Whether the step is a global step or not. Returns: Type Description bool True if the step is a global step, False otherwise. 
"},{"location":"api/step/#distilabel.steps.base._Step.is_normal","title":"is_normal: bool property ","text":"Whether the step is a normal step or not. Returns: Type Description bool True if the step is a normal step, False otherwise. "},{"location":"api/step/#distilabel.steps.base._Step.inputs","title":"inputs: StepColumns property ","text":"List of strings with the names of the mandatory columns that the step needs as input or dictionary in which the keys are the input columns of the step and the values are booleans indicating whether the column is optional or not. Returns: Type Description StepColumns List of strings with the names of the columns that the step needs as input. "},{"location":"api/step/#distilabel.steps.base._Step.outputs","title":"outputs: StepColumns property ","text":"List of strings with the names of the columns that the step will produce as output or dictionary in which the keys are the output columns of the step and the values are booleans indicating whether the column is optional or not. Returns: Type Description StepColumns List of strings with the names of the columns that the step will produce as StepColumns output. "},{"location":"api/step/#distilabel.steps.base._Step.process_parameters","title":"process_parameters: List[inspect.Parameter] cached property ","text":"Returns the parameters of the process method of the step. Returns: Type Description List[Parameter] The parameters of the process method of the step. "},{"location":"api/step/#distilabel.steps.base._Step.artifacts_directory","title":"artifacts_directory: Union[Path, None] property ","text":"Gets the path of the directory where the step should save its generated artifacts. Returns: Type Description Union[Path, None] The path of the directory where the step should save the generated artifacts, or None if _pipeline_artifacts_path is not set. "},{"location":"api/step/#distilabel.steps.base._Step.connect","title":"connect(*steps, routing_batch_function=None) ","text":"Connects the current step to another step in the pipeline, which means that the output of this step will be the input of the other step. Parameters: Name Type Description Default steps _Step The steps to connect to the current step. () routing_batch_function Optional[RoutingBatchFunction] A function that receives a list of steps and returns a list of steps to which the output batch generated by this step should be routed. It should be used to define the routing logic of the pipeline. If not provided, the output batch will be routed to all the connected steps. Defaults to None . None Source code in src/distilabel/steps/base.py def connect(\n self,\n *steps: \"_Step\",\n routing_batch_function: Optional[\"RoutingBatchFunction\"] = None,\n) -> None:\n \"\"\"Connects the current step to another step in the pipeline, which means that\n the output of this step will be the input of the other step.\n\n Args:\n steps: The steps to connect to the current step.\n routing_batch_function: A function that receives a list of steps and returns\n a list of steps to which the output batch generated by this step should be\n routed. It should be used to define the routing logic of the pipeline. 
If\n not provided, the output batch will be routed to all the connected steps.\n Defaults to `None`.\n \"\"\"\n assert self.pipeline is not None\n\n if routing_batch_function:\n self._set_routing_batch_function(routing_batch_function)\n\n for step in steps:\n self.pipeline._add_edge(from_step=self.name, to_step=step.name) # type: ignore\n "},{"location":"api/step/#distilabel.steps.base._Step.__rshift__","title":"__rshift__(other) ","text":"__rshift__(other: RoutingBatchFunction) -> RoutingBatchFunction\n
__rshift__(other: List[DownstreamConnectableSteps]) -> List[DownstreamConnectableSteps]\n
__rshift__(other: DownstreamConnectable) -> DownstreamConnectable\n Allows using the >> operator to connect steps in the pipeline. Parameters: Name Type Description Default other Union[DownstreamConnectable, RoutingBatchFunction, List[DownstreamConnectableSteps]] The step to connect, a list of steps to connect to or a routing batch function to be set for the step. required Returns: Type Description Union[DownstreamConnectable, RoutingBatchFunction, List[DownstreamConnectableSteps]] The connected step, the list of connected steps or the routing batch function. Example step1 >> step2\n# Would be equivalent to:\nstep1.connect(step2)\n\n# It also allows to connect a list of steps\nstep1 >> [step2, step3]\n Source code in src/distilabel/steps/base.py def __rshift__(\n self,\n other: Union[\n \"DownstreamConnectable\",\n \"RoutingBatchFunction\",\n List[\"DownstreamConnectableSteps\"],\n ],\n) -> Union[\n \"DownstreamConnectable\",\n \"RoutingBatchFunction\",\n List[\"DownstreamConnectableSteps\"],\n]:\n \"\"\"Allows using the `>>` operator to connect steps in the pipeline.\n\n Args:\n other: The step to connect, a list of steps to connect to or a routing batch\n function to be set for the step.\n\n Returns:\n The connected step, the list of connected steps or the routing batch function.\n\n Example:\n ```python\n step1 >> step2\n # Would be equivalent to:\n step1.connect(step2)\n\n # It also allows to connect a list of steps\n step1 >> [step2, step3]\n ```\n \"\"\"\n # Here to avoid circular imports\n from distilabel.pipeline.routing_batch_function import RoutingBatchFunction\n\n if isinstance(other, list):\n self.connect(*other)\n return other\n\n if isinstance(other, RoutingBatchFunction):\n self._set_routing_batch_function(other)\n return other\n\n self.connect(other)\n return other\n "},{"location":"api/step/#distilabel.steps.base._Step.__rrshift__","title":"__rrshift__(other) ","text":"Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline to a single step, as the list doesn't have the rshift operator. Parameters: Name Type Description Default other List[UpstreamConnectableSteps] The step to connect to. required Returns: Type Description Self The connected step Example [step2, step3] >> step1\n# Would be equivalent to:\nstep2.connect(step1)\nstep3.connect(step1)\n Source code in src/distilabel/steps/base.py def __rrshift__(self, other: List[\"UpstreamConnectableSteps\"]) -> Self:\n \"\"\"Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline\n to a single step, as the list doesn't have the __rshift__ operator.\n\n Args:\n other: The step to connect to.\n\n Returns:\n The connected step\n\n Example:\n ```python\n [step2, step3] >> step1\n # Would be equivalent to:\n step2.connect(step1)\n step3.connect(step1)\n ```\n \"\"\"\n for o in other:\n o.connect(self)\n return self\n "},{"location":"api/step/#distilabel.steps.base._Step.load","title":"load() ","text":"Method to perform any initialization logic before the process method is called. For example, to load an LLM, stablish a connection to a database, etc. Source code in src/distilabel/steps/base.py def load(self) -> None:\n \"\"\"Method to perform any initialization logic before the `process` method is\n called. 
For example, to load an LLM, stablish a connection to a database, etc.\n \"\"\"\n self._logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n "},{"location":"api/step/#distilabel.steps.base._Step.unload","title":"unload() ","text":"Method to perform any cleanup logic after the process method is called. For example, to close a connection to a database, etc. Source code in src/distilabel/steps/base.py def unload(self) -> None:\n \"\"\"Method to perform any cleanup logic after the `process` method is called. For\n example, to close a connection to a database, etc.\n \"\"\"\n self._logger.debug(\"Executing step unload logic.\")\n "},{"location":"api/step/#distilabel.steps.base._Step.has_multiple_inputs","title":"has_multiple_inputs() ","text":"Whether the process method of the step receives more than one input or not i.e. has a * argument annotated with StepInput . Returns: Type Description bool True if the process method of the step receives more than one input, bool False otherwise. Source code in src/distilabel/steps/base.py def has_multiple_inputs(self) -> bool:\n \"\"\"Whether the `process` method of the step receives more than one input or not\n i.e. has a `*` argument annotated with `StepInput`.\n\n Returns:\n `True` if the `process` method of the step receives more than one input,\n `False` otherwise.\n \"\"\"\n return any(\n param.kind == param.VAR_POSITIONAL for param in self.process_parameters\n )\n "},{"location":"api/step/#distilabel.steps.base._Step.get_process_step_input","title":"get_process_step_input() ","text":"Returns the parameter of the process method of the step annotated with StepInput . Returns: Type Description Union[Parameter, None] The parameter of the process method of the step annotated with StepInput , Union[Parameter, None] or None if there is no parameter annotated with StepInput . Raises: Type Description TypeError If the step has more than one parameter annotated with StepInput . Source code in src/distilabel/steps/base.py def get_process_step_input(self) -> Union[inspect.Parameter, None]:\n \"\"\"Returns the parameter of the `process` method of the step annotated with\n `StepInput`.\n\n Returns:\n The parameter of the `process` method of the step annotated with `StepInput`,\n or `None` if there is no parameter annotated with `StepInput`.\n\n Raises:\n TypeError: If the step has more than one parameter annotated with `StepInput`.\n \"\"\"\n step_input_parameter = None\n for parameter in self.process_parameters:\n if is_parameter_annotated_with(parameter, _STEP_INPUT_ANNOTATION):\n if step_input_parameter is not None:\n raise DistilabelTypeError(\n f\"Step '{self.name}' should have only one parameter with type\"\n \" hint `StepInput`.\",\n page=\"sections/how_to_guides/basic/step/#defining-custom-steps\",\n )\n step_input_parameter = parameter\n return step_input_parameter\n "},{"location":"api/step/#distilabel.steps.base._Step.verify_inputs_mappings","title":"verify_inputs_mappings() ","text":"Verifies that the inputs_mappings of the step are valid i.e. the input columns exist in the inputs of the step. Raises: Type Description ValueError If the inputs_mappings of the step are not valid. Source code in src/distilabel/steps/base.py def verify_inputs_mappings(self) -> None:\n \"\"\"Verifies that the `inputs_mappings` of the step are valid i.e. 
the input\n columns exist in the inputs of the step.\n\n Raises:\n ValueError: If the `inputs_mappings` of the step are not valid.\n \"\"\"\n if not self.input_mappings:\n return\n\n for input in self.input_mappings:\n if input not in self.inputs:\n raise DistilabelUserError(\n f\"The input column '{input}' doesn't exist in the inputs of the\"\n f\" step '{self.name}'. Inputs of the step are: {self.inputs}.\"\n \" Please, review the `inputs_mappings` argument of the step.\",\n page=\"sections/how_to_guides/basic/step/#arguments\",\n )\n "},{"location":"api/step/#distilabel.steps.base._Step.verify_outputs_mappings","title":"verify_outputs_mappings() ","text":"Verifies that the outputs_mappings of the step are valid i.e. the output columns exist in the outputs of the step. Raises: Type Description ValueError If the outputs_mappings of the step are not valid. Source code in src/distilabel/steps/base.py def verify_outputs_mappings(self) -> None:\n \"\"\"Verifies that the `outputs_mappings` of the step are valid i.e. the output\n columns exist in the outputs of the step.\n\n Raises:\n ValueError: If the `outputs_mappings` of the step are not valid.\n \"\"\"\n if not self.output_mappings:\n return\n\n for output in self.output_mappings:\n if output not in self.outputs:\n raise DistilabelUserError(\n f\"The output column '{output}' doesn't exist in the outputs of the\"\n f\" step '{self.name}'. Outputs of the step are: {self.outputs}.\"\n \" Please, review the `outputs_mappings` argument of the step.\",\n page=\"sections/how_to_guides/basic/step/#arguments\",\n )\n "},{"location":"api/step/#distilabel.steps.base._Step.get_inputs","title":"get_inputs() ","text":"Gets the inputs of the step after the input_mappings . This method is meant to be used to run validations on the inputs of the step. Returns: Type Description Dict[str, bool] The inputs of the step after the input_mappings and if they are required or Dict[str, bool] not. Source code in src/distilabel/steps/base.py def get_inputs(self) -> Dict[str, bool]:\n \"\"\"Gets the inputs of the step after the `input_mappings`. This method is meant\n to be used to run validations on the inputs of the step.\n\n Returns:\n The inputs of the step after the `input_mappings` and if they are required or\n not.\n \"\"\"\n if isinstance(self.inputs, list):\n return {\n self.input_mappings.get(input, input): True for input in self.inputs\n }\n\n return {\n self.input_mappings.get(input, input): required\n for input, required in self.inputs.items()\n }\n "},{"location":"api/step/#distilabel.steps.base._Step.get_outputs","title":"get_outputs() ","text":"Gets the outputs of the step after the outputs_mappings . This method is meant to be used to run validations on the outputs of the step. Returns: Type Description Dict[str, bool] The outputs of the step after the outputs_mappings and if they are required Dict[str, bool] or not. Source code in src/distilabel/steps/base.py def get_outputs(self) -> Dict[str, bool]:\n \"\"\"Gets the outputs of the step after the `outputs_mappings`. 
This method is\n meant to be used to run validations on the outputs of the step.\n\n Returns:\n The outputs of the step after the `outputs_mappings` and if they are required\n or not.\n \"\"\"\n if isinstance(self.outputs, list):\n return {\n self.output_mappings.get(output, output): True\n for output in self.outputs\n }\n\n return {\n self.output_mappings.get(output, output): required\n for output, required in self.outputs.items()\n }\n "},{"location":"api/step/#distilabel.steps.base._Step.set_pipeline_artifacts_path","title":"set_pipeline_artifacts_path(path) ","text":"Sets the _pipeline_artifacts_path attribute. This method is meant to be used by the Pipeline once the cache location is known. Parameters: Name Type Description Default path Path the path where the artifacts generated by the pipeline steps should be saved. required Source code in src/distilabel/steps/base.py def set_pipeline_artifacts_path(self, path: Path) -> None:\n \"\"\"Sets the `_pipeline_artifacts_path` attribute. This method is meant to be used\n by the `Pipeline` once the cache location is known.\n\n Args:\n path: the path where the artifacts generated by the pipeline steps should be\n saved.\n \"\"\"\n self._pipeline_artifacts_path = path\n "},{"location":"api/step/#distilabel.steps.base._Step.save_artifact","title":"save_artifact(name, write_function, metadata=None) ","text":"Saves an artifact generated by the Step . Parameters: Name Type Description Default name str the name of the artifact. required write_function Callable[[Path], None] a function that will receive the path where the artifact should be saved. required metadata Optional[Dict[str, Any]] the artifact metadata. Defaults to None . None Source code in src/distilabel/steps/base.py def save_artifact(\n self,\n name: str,\n write_function: Callable[[Path], None],\n metadata: Optional[Dict[str, Any]] = None,\n) -> None:\n \"\"\"Saves an artifact generated by the `Step`.\n\n Args:\n name: the name of the artifact.\n write_function: a function that will receive the path where the artifact should\n be saved.\n metadata: the artifact metadata. Defaults to `None`.\n \"\"\"\n if self.artifacts_directory is None:\n self._logger.warning(\n f\"Cannot save artifact with '{name}' as `_pipeline_artifacts_path` is not\"\n \" set. This is normal if the `Step` is being executed as a standalone component.\"\n )\n return\n\n artifact_directory_path = self.artifacts_directory / name\n artifact_directory_path.mkdir(parents=True, exist_ok=True)\n\n self._logger.info(f\"\ud83c\udffa Storing '{name}' generated artifact...\")\n\n self._logger.debug(\n f\"Calling `write_function` to write artifact in '{artifact_directory_path}'...\"\n )\n write_function(artifact_directory_path)\n\n metadata_path = artifact_directory_path / \"metadata.json\"\n self._logger.debug(\n f\"Calling `write_json` to write artifact metadata in '{metadata_path}'...\"\n )\n write_json(filename=metadata_path, data=metadata or {})\n "},{"location":"api/step/#distilabel.steps.base._Step.impute_step_outputs","title":"impute_step_outputs(step_output) ","text":"Imputes the output columns of the step that are not present in the step output. 
Source code in src/distilabel/steps/base.py def impute_step_outputs(\n self, step_output: List[Dict[str, Any]]\n) -> List[Dict[str, Any]]:\n \"\"\"\n Imputes the output columns of the step that are not present in the step output.\n \"\"\"\n result = []\n for row in step_output:\n data = row.copy()\n for output in self.get_outputs().keys():\n data[output] = None\n result.append(data)\n return result\n "},{"location":"api/step/#distilabel.steps.base.Step","title":"Step ","text":" Bases: _Step , ABC Base class for the steps that can be included in a Pipeline . Attributes: Name Type Description input_batch_size RuntimeParameter[PositiveInt] The number of rows that will contain the batches processed by the step. Defaults to 50 . Runtime parameters input_batch_size : The number of rows that will contain the batches processed by the step. Defaults to 50 . Source code in src/distilabel/steps/base.py class Step(_Step, ABC):\n \"\"\"Base class for the steps that can be included in a `Pipeline`.\n\n Attributes:\n input_batch_size: The number of rows that will contain the batches processed by\n the step. Defaults to `50`.\n\n Runtime parameters:\n - `input_batch_size`: The number of rows that will contain the batches processed\n by the step. Defaults to `50`.\n \"\"\"\n\n input_batch_size: RuntimeParameter[PositiveInt] = Field(\n default=DEFAULT_INPUT_BATCH_SIZE,\n description=\"The number of rows that will contain the batches processed by the\"\n \" step.\",\n )\n\n @abstractmethod\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"Method that defines the processing logic of the step. It should yield the\n output rows.\n\n Args:\n *inputs: An argument used to receive the outputs of the previous steps. The\n number of arguments depends on the number of previous steps. It doesn't\n need to be an `*args` argument, it can be a regular argument annotated\n with `StepInput` if the step has only one previous step.\n \"\"\"\n pass\n\n def process_applying_mappings(self, *args: List[Dict[str, Any]]) -> \"StepOutput\":\n \"\"\"Runs the `process` method of the step applying the `input_mappings` to the input\n rows and the `outputs_mappings` to the output rows. This is the function that\n should be used to run the processing logic of the step.\n\n Yields:\n The output rows.\n \"\"\"\n\n inputs, overriden_inputs = (\n self._apply_input_mappings(args)\n if self.input_mappings\n else (args, [{} for _ in range(len(args[0]))])\n )\n\n # If the `Step` was built using the `@step` decorator, then we need to pass\n # the runtime parameters as kwargs, so they can be used within the processing\n # function\n generator = (\n self.process(*inputs)\n if not self._built_from_decorator\n else self.process(*inputs, **self._runtime_parameters)\n )\n\n for output_rows in generator:\n restored = []\n for i, row in enumerate(output_rows):\n # Correct the index here because we don't know the num_generations from the llm\n # ahead of time. 
For example, if we have `len(overriden_inputs)==5` and `len(row)==10`,\n # from `num_generations==2` and `group_generations=False` in the LLM:\n # The loop will use indices 0, 1, 2, 3, 4, 0, 1, 2, 3, 4\n ntimes_i = i % len(overriden_inputs)\n restored.append(\n self._apply_mappings_and_restore_overriden(\n row, overriden_inputs[ntimes_i]\n )\n )\n yield restored\n\n def _apply_input_mappings(\n self, inputs: Tuple[List[Dict[str, Any]], ...]\n ) -> Tuple[Tuple[List[Dict[str, Any]], ...], List[Dict[str, Any]]]:\n \"\"\"Applies the `input_mappings` to the input rows.\n\n Args:\n inputs: The input rows.\n\n Returns:\n The input rows with the `input_mappings` applied and the overriden values\n that were replaced by the `input_mappings`.\n \"\"\"\n reverted_input_mappings = {v: k for k, v in self.input_mappings.items()}\n\n renamed_inputs = []\n overriden_inputs = []\n for i, row_inputs in enumerate(inputs):\n renamed_row_inputs = []\n for row in row_inputs:\n overriden_keys = {}\n renamed_row = {}\n for k, v in row.items():\n renamed_key = reverted_input_mappings.get(k, k)\n\n if renamed_key not in renamed_row or k != renamed_key:\n renamed_row[renamed_key] = v\n\n if k != renamed_key and renamed_key in row and len(inputs) == 1:\n overriden_keys[renamed_key] = row[renamed_key]\n\n if i == 0:\n overriden_inputs.append(overriden_keys)\n renamed_row_inputs.append(renamed_row)\n renamed_inputs.append(renamed_row_inputs)\n return tuple(renamed_inputs), overriden_inputs\n\n def _apply_mappings_and_restore_overriden(\n self, row: Dict[str, Any], overriden: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"Reverts the `input_mappings` applied to the input rows and applies the `output_mappings`\n to the output rows. In addition, it restores the overriden values that were replaced\n by the `input_mappings`.\n\n Args:\n row: The output row.\n overriden: The overriden values that were replaced by the `input_mappings`.\n\n Returns:\n The output row with the `output_mappings` applied and the overriden values\n restored.\n \"\"\"\n result = {}\n for k, v in row.items():\n mapped_key = (\n self.output_mappings.get(k, None)\n or self.input_mappings.get(k, None)\n or k\n )\n result[mapped_key] = v\n\n # Restore overriden values\n for k, v in overriden.items():\n if k not in result:\n result[k] = v\n\n return result\n "},{"location":"api/step/#distilabel.steps.base.Step.process","title":"process(*inputs) abstractmethod ","text":"Method that defines the processing logic of the step. It should yield the output rows. Parameters: Name Type Description Default *inputs StepInput An argument used to receive the outputs of the previous steps. The number of arguments depends on the number of previous steps. It doesn't need to be an *args argument, it can be a regular argument annotated with StepInput if the step has only one previous step. () Source code in src/distilabel/steps/base.py @abstractmethod\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"Method that defines the processing logic of the step. It should yield the\n output rows.\n\n Args:\n *inputs: An argument used to receive the outputs of the previous steps. The\n number of arguments depends on the number of previous steps. 
It doesn't\n need to be an `*args` argument, it can be a regular argument annotated\n with `StepInput` if the step has only one previous step.\n \"\"\"\n pass\n "},{"location":"api/step/#distilabel.steps.base.Step.process_applying_mappings","title":"process_applying_mappings(*args) ","text":"Runs the process method of the step applying the input_mappings to the input rows and the outputs_mappings to the output rows. This is the function that should be used to run the processing logic of the step. Yields: Type Description StepOutput The output rows. Source code in src/distilabel/steps/base.py def process_applying_mappings(self, *args: List[Dict[str, Any]]) -> \"StepOutput\":\n \"\"\"Runs the `process` method of the step applying the `input_mappings` to the input\n rows and the `outputs_mappings` to the output rows. This is the function that\n should be used to run the processing logic of the step.\n\n Yields:\n The output rows.\n \"\"\"\n\n inputs, overriden_inputs = (\n self._apply_input_mappings(args)\n if self.input_mappings\n else (args, [{} for _ in range(len(args[0]))])\n )\n\n # If the `Step` was built using the `@step` decorator, then we need to pass\n # the runtime parameters as kwargs, so they can be used within the processing\n # function\n generator = (\n self.process(*inputs)\n if not self._built_from_decorator\n else self.process(*inputs, **self._runtime_parameters)\n )\n\n for output_rows in generator:\n restored = []\n for i, row in enumerate(output_rows):\n # Correct the index here because we don't know the num_generations from the llm\n # ahead of time. For example, if we have `len(overriden_inputs)==5` and `len(row)==10`,\n # from `num_generations==2` and `group_generations=False` in the LLM:\n # The loop will use indices 0, 1, 2, 3, 4, 0, 1, 2, 3, 4\n ntimes_i = i % len(overriden_inputs)\n restored.append(\n self._apply_mappings_and_restore_overriden(\n row, overriden_inputs[ntimes_i]\n )\n )\n yield restored\n "},{"location":"api/step/decorator/","title":"@step","text":"This section contains the reference for the @step decorator, used to create new Step subclasses without having to manually define the class. For more information check the Tutorial - Step page. "},{"location":"api/step/decorator/#distilabel.steps.decorator","title":"decorator ","text":""},{"location":"api/step/decorator/#distilabel.steps.decorator.step","title":"step(inputs=None, outputs=None, step_type='normal') ","text":"step(inputs: Union[StepColumns, None] = None, outputs: Union[StepColumns, None] = None, step_type: Literal['normal'] = 'normal') -> Callable[..., Type[Step]]\n
step(inputs: Union[StepColumns, None] = None, outputs: Union[StepColumns, None] = None, step_type: Literal['global'] = 'global') -> Callable[..., Type[GlobalStep]]\n
step(inputs: None = None, outputs: Union[StepColumns, None] = None, step_type: Literal['generator'] = 'generator') -> Callable[..., Type[GeneratorStep]]\n Creates an Step from a processing function. Parameters: Name Type Description Default inputs Union[StepColumns, None] a list containing the name of the inputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not, that are required by the step. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None . None outputs Union[StepColumns, None] a list containing the name of the outputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column will be generated or not. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None . None step_type Literal['normal', 'global', 'generator'] the kind of step to create. Valid choices are: \"normal\" (Step ), \"global\" (GlobalStep ) or \"generator\" (GeneratorStep ). Defaults to \"normal\" . 'normal' Returns: Type Description Callable[..., Type[_Step]] A callable that will generate the type given the processing function. Example: # Normal step\n@step(inputs=[\"instruction\"], outputs=[\"generation\"])\ndef GenerationStep(inputs: StepInput, dummy_generation: RuntimeParameter[str]) -> StepOutput:\n for input in inputs:\n input[\"generation\"] = dummy_generation\n yield inputs\n\n# Global step\n@step(inputs=[\"instruction\"], step_type=\"global\")\ndef FilteringStep(inputs: StepInput, max_length: RuntimeParameter[int] = 256) -> StepOutput:\n yield [\n input\n for input in inputs\n if len(input[\"instruction\"]) <= max_length\n ]\n\n# Generator step\n@step(outputs=[\"num\"], step_type=\"generator\")\ndef RowGenerator(num_rows: RuntimeParameter[int] = 500) -> GeneratorStepOutput:\n data = list(range(num_rows))\n for i in range(0, len(data), 100):\n last_batch = i + 100 >= len(data)\n yield [{\"num\": num} for num in data[i : i + 100]], last_batch\n Source code in src/distilabel/steps/decorator.py def step(\n inputs: Union[\"StepColumns\", None] = None,\n outputs: Union[\"StepColumns\", None] = None,\n step_type: Literal[\"normal\", \"global\", \"generator\"] = \"normal\",\n) -> Callable[..., Type[\"_Step\"]]:\n \"\"\"Creates an `Step` from a processing function.\n\n Args:\n inputs: a list containing the name of the inputs columns/keys or a dictionary\n where the keys are the columns and the values are booleans indicating whether\n the column is required or not, that are required by the step. If not provided\n the default will be an empty list `[]` and it will be assumed that the step\n doesn't need any specific columns. Defaults to `None`.\n outputs: a list containing the name of the outputs columns/keys or a dictionary\n where the keys are the columns and the values are booleans indicating whether\n the column will be generated or not. If not provided the default will be an\n empty list `[]` and it will be assumed that the step doesn't need any specific\n columns. Defaults to `None`.\n step_type: the kind of step to create. Valid choices are: \"normal\" (`Step`),\n \"global\" (`GlobalStep`) or \"generator\" (`GeneratorStep`). 
Defaults to\n `\"normal\"`.\n\n Returns:\n A callable that will generate the type given the processing function.\n\n Example:\n\n ```python\n # Normal step\n @step(inputs=[\"instruction\"], outputs=[\"generation\"])\n def GenerationStep(inputs: StepInput, dummy_generation: RuntimeParameter[str]) -> StepOutput:\n for input in inputs:\n input[\"generation\"] = dummy_generation\n yield inputs\n\n # Global step\n @step(inputs=[\"instruction\"], step_type=\"global\")\n def FilteringStep(inputs: StepInput, max_length: RuntimeParameter[int] = 256) -> StepOutput:\n yield [\n input\n for input in inputs\n if len(input[\"instruction\"]) <= max_length\n ]\n\n # Generator step\n @step(outputs=[\"num\"], step_type=\"generator\")\n def RowGenerator(num_rows: RuntimeParameter[int] = 500) -> GeneratorStepOutput:\n data = list(range(num_rows))\n for i in range(0, len(data), 100):\n last_batch = i + 100 >= len(data)\n yield [{\"num\": num} for num in data[i : i + 100]], last_batch\n ```\n \"\"\"\n\n inputs = inputs or []\n outputs = outputs or []\n\n def decorator(func: ProcessingFunc) -> Type[\"_Step\"]:\n if step_type not in _STEP_MAPPING:\n raise ValueError(\n f\"Invalid step type '{step_type}'. Please, review the '{func.__name__}'\"\n \" function decorated with the `@step` decorator and provide a valid\"\n \" `step_type`. Valid choices are: 'normal', 'global' or 'generator'.\"\n )\n\n BaseClass = _STEP_MAPPING[step_type]\n\n signature = inspect.signature(func)\n\n runtime_parameters = {\n name: (\n param.annotation,\n param.default if param.default != param.empty else None,\n )\n for name, param in signature.parameters.items()\n }\n\n runtime_parameters = {}\n step_input_parameter = None\n for name, param in signature.parameters.items():\n if is_parameter_annotated_with(param, _RUNTIME_PARAMETER_ANNOTATION):\n runtime_parameters[name] = (\n param.annotation,\n param.default if param.default != param.empty else None,\n )\n\n if not step_type == \"generator\" and is_parameter_annotated_with(\n param, _STEP_INPUT_ANNOTATION\n ):\n if step_input_parameter is not None:\n raise ValueError(\n f\"Function '{func.__name__}' has more than one parameter annotated\"\n f\" with `StepInput`. Please, review the '{func.__name__}' function\"\n \" decorated with the `@step` decorator and provide only one\"\n \" argument annotated with `StepInput`.\"\n )\n step_input_parameter = param\n\n RuntimeParametersModel = create_model( # type: ignore\n \"RuntimeParametersModel\",\n **runtime_parameters, # type: ignore\n )\n\n def inputs_property(self) -> \"StepColumns\":\n return inputs\n\n def outputs_property(self) -> \"StepColumns\":\n return outputs\n\n def process(\n self, *args: Any, **kwargs: Any\n ) -> Union[\"StepOutput\", \"GeneratorStepOutput\"]:\n return func(*args, **kwargs)\n\n return type( # type: ignore\n func.__name__,\n (\n BaseClass,\n RuntimeParametersModel,\n ),\n {\n \"process\": process,\n \"inputs\": property(inputs_property),\n \"outputs\": property(outputs_property),\n \"__module__\": func.__module__,\n \"__doc__\": func.__doc__,\n \"_built_from_decorator\": True,\n # Override the `get_process_step_input` method to return the parameter\n # of the original function annotated with `StepInput`.\n \"get_process_step_input\": lambda self: step_input_parameter,\n },\n )\n\n return decorator\n "},{"location":"api/step/generator_step/","title":"GeneratorStep","text":"This section contains the API reference for the GeneratorStep class. 
For more information and examples on how to use existing generator steps or create custom ones, please refer to Tutorial - Step - GeneratorStep. "},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep","title":"GeneratorStep ","text":" Bases: _Step , ABC A special kind of Step that is able to generate data i.e. it doesn't receive any input from the previous steps. Attributes: Name Type Description batch_size RuntimeParameter[int] The number of rows that will contain the batches generated by the step. Defaults to 50 . Runtime parameters batch_size : The number of rows that will contain the batches generated by the step. Defaults to 50 . Source code in src/distilabel/steps/base.py class GeneratorStep(_Step, ABC):\n \"\"\"A special kind of `Step` that is able to generate data i.e. it doesn't receive\n any input from the previous steps.\n\n Attributes:\n batch_size: The number of rows that will contain the batches generated by the\n step. Defaults to `50`.\n\n Runtime parameters:\n - `batch_size`: The number of rows that will contain the batches generated by\n the step. Defaults to `50`.\n \"\"\"\n\n batch_size: RuntimeParameter[int] = Field(\n default=50,\n description=\"The number of rows that will contain the batches generated by the\"\n \" step.\",\n )\n\n @abstractmethod\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Method that defines the generation logic of the step. It should yield the\n output rows and a boolean indicating if it's the last batch or not.\n\n Args:\n offset: The offset to start the generation from. Defaults to 0.\n\n Yields:\n The output rows and a boolean indicating if it's the last batch or not.\n \"\"\"\n pass\n\n def process_applying_mappings(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Runs the `process` method of the step applying the `outputs_mappings` to the\n output rows. This is the function that should be used to run the generation logic\n of the step.\n\n Args:\n offset: The offset to start the generation from. Defaults to 0.\n\n Yields:\n The output rows and a boolean indicating if it's the last batch or not.\n \"\"\"\n\n # If the `Step` was built using the `@step` decorator, then we need to pass\n # the runtime parameters as `kwargs`, so they can be used within the processing\n # function\n generator = (\n self.process(offset=offset)\n if not self._built_from_decorator\n else self.process(offset=offset, **self._runtime_parameters)\n )\n\n for output_rows, last_batch in generator:\n yield (\n [\n {self.output_mappings.get(k, k): v for k, v in row.items()}\n for row in output_rows\n ],\n last_batch,\n )\n "},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep.process","title":"process(offset=0) abstractmethod ","text":"Method that defines the generation logic of the step. It should yield the output rows and a boolean indicating if it's the last batch or not. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0. 0 Yields: Type Description GeneratorStepOutput The output rows and a boolean indicating if it's the last batch or not. Source code in src/distilabel/steps/base.py @abstractmethod\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Method that defines the generation logic of the step. It should yield the\n output rows and a boolean indicating if it's the last batch or not.\n\n Args:\n offset: The offset to start the generation from. 
Defaults to 0.\n\n Yields:\n The output rows and a boolean indicating if it's the last batch or not.\n \"\"\"\n pass\n "},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep.process_applying_mappings","title":"process_applying_mappings(offset=0) ","text":"Runs the process method of the step applying the outputs_mappings to the output rows. This is the function that should be used to run the generation logic of the step. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0. 0 Yields: Type Description GeneratorStepOutput The output rows and a boolean indicating if it's the last batch or not. Source code in src/distilabel/steps/base.py def process_applying_mappings(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Runs the `process` method of the step applying the `outputs_mappings` to the\n output rows. This is the function that should be used to run the generation logic\n of the step.\n\n Args:\n offset: The offset to start the generation from. Defaults to 0.\n\n Yields:\n The output rows and a boolean indicating if it's the last batch or not.\n \"\"\"\n\n # If the `Step` was built using the `@step` decorator, then we need to pass\n # the runtime parameters as `kwargs`, so they can be used within the processing\n # function\n generator = (\n self.process(offset=offset)\n if not self._built_from_decorator\n else self.process(offset=offset, **self._runtime_parameters)\n )\n\n for output_rows, last_batch in generator:\n yield (\n [\n {self.output_mappings.get(k, k): v for k, v in row.items()}\n for row in output_rows\n ],\n last_batch,\n )\n "},{"location":"api/step/generator_step/#distilabel.steps.generators.utils.make_generator_step","title":"make_generator_step(dataset, pipeline=None, batch_size=50, input_mappings=None, output_mappings=None, resources=StepResources(), repo_id='default_name') ","text":"Helper method to create a GeneratorStep from a dataset, to simplify Parameters: Name Type Description Default dataset Union[Dataset, DataFrame, List[Dict[str, str]]] The dataset to use in the Pipeline . required batch_size int The batch_size, will default to the same used by the GeneratorStep s. Defaults to 50 . 50 input_mappings Optional[Dict[str, str]] Applies the same as any other step. Defaults to None . None output_mappings Optional[Dict[str, str]] Applies the same as any other step. Defaults to None . None resources StepResources Applies the same as any other step. Defaults to StepResources() . StepResources() repo_id Optional[str] The repository ID to use in the LoadDataFromHub step. This shouldn't be necessary, but in case of error, the dataset will try to be loaded using load_dataset internally. If that case happens, the repo_id will be used. 'default_name' Raises: Type Description ValueError If the format is different from the ones supported. Returns: Type Description GeneratorStep A LoadDataFromDicts if the input is a list of dicts, or LoadDataFromHub instance GeneratorStep if the input is a pd.DataFrame or a Dataset . 
Source code in src/distilabel/steps/generators/utils.py def make_generator_step(\n dataset: Union[Dataset, pd.DataFrame, List[Dict[str, str]]],\n pipeline: Union[\"BasePipeline\", None] = None,\n batch_size: int = 50,\n input_mappings: Optional[Dict[str, str]] = None,\n output_mappings: Optional[Dict[str, str]] = None,\n resources: StepResources = StepResources(),\n repo_id: Optional[str] = \"default_name\",\n) -> \"GeneratorStep\":\n \"\"\"Helper method to create a `GeneratorStep` from a dataset, to simplify\n\n Args:\n dataset: The dataset to use in the `Pipeline`.\n batch_size: The batch_size, will default to the same used by the `GeneratorStep`s.\n Defaults to `50`.\n input_mappings: Applies the same as any other step. Defaults to `None`.\n output_mappings: Applies the same as any other step. Defaults to `None`.\n resources: Applies the same as any other step. Defaults to `StepResources()`.\n repo_id: The repository ID to use in the `LoadDataFromHub` step.\n This shouldn't be necessary, but in case of error, the dataset will try to be loaded\n using `load_dataset` internally. If that case happens, the `repo_id` will be used.\n\n Raises:\n ValueError: If the format is different from the ones supported.\n\n Returns:\n A `LoadDataFromDicts` if the input is a list of dicts, or `LoadDataFromHub` instance\n if the input is a `pd.DataFrame` or a `Dataset`.\n \"\"\"\n from distilabel.steps import LoadDataFromDicts, LoadDataFromHub\n\n if isinstance(dataset, list):\n return LoadDataFromDicts(\n pipeline=pipeline,\n data=dataset,\n batch_size=batch_size,\n input_mappings=input_mappings or {},\n output_mappings=output_mappings or {},\n resources=resources,\n )\n\n if isinstance(dataset, pd.DataFrame):\n dataset = Dataset.from_pandas(dataset, preserve_index=False)\n\n if not isinstance(dataset, Dataset):\n raise DistilabelUserError(\n f\"Dataset type not allowed: {type(dataset)}, must be one of: \"\n \"`datasets.Dataset`, `pd.DataFrame`, `List[Dict[str, str]]`\",\n page=\"sections/how_to_guides/basic/pipeline/?h=make_#__tabbed_1_2\",\n )\n\n loader = LoadDataFromHub(\n pipeline=pipeline,\n repo_id=repo_id,\n batch_size=batch_size,\n input_mappings=input_mappings or {},\n output_mappings=output_mappings or {},\n resources=resources,\n )\n super(loader.__class__, loader).load() # Ensure the logger is loaded\n loader._dataset = dataset\n loader.num_examples = len(dataset)\n loader._dataset_info = {\"default\": dataset.info}\n return loader\n "},{"location":"api/step/global_step/","title":"GlobalStep","text":"This section contains the API reference for the GlobalStep class. For more information and examples on how to use existing global steps or create custom ones, please refer to Tutorial - Step - GlobalStep. "},{"location":"api/step/global_step/#distilabel.steps.base.GlobalStep","title":"GlobalStep ","text":" Bases: Step , ABC A special kind of Step whose process method receives all the data processed by its previous steps at once, instead of receiving it in batches. This kind of step is useful when the processing logic requires having all the data at once, for example to train a model, to perform a global aggregation, etc. Source code in src/distilabel/steps/base.py class GlobalStep(Step, ABC):\n \"\"\"A special kind of `Step` which it's `process` method receives all the data processed\n by their previous steps at once, instead of receiving it in batches. 
This kind of steps\n are useful when the processing logic requires to have all the data at once, for example\n to train a model, to perform a global aggregation, etc.\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n return []\n\n @property\n def outputs(self) -> \"StepColumns\":\n return []\n "},{"location":"api/step/resources/","title":"StepResources","text":""},{"location":"api/step/resources/#distilabel.steps.base.StepResources","title":"StepResources ","text":" Bases: RuntimeParametersMixin , BaseModel A class to define the resources assigned to a _Step . Attributes: Name Type Description replicas RuntimeParameter[PositiveInt] The number of replicas for the step. cpus Optional[RuntimeParameter[PositiveInt]] The number of CPUs assigned to each step replica. gpus Optional[RuntimeParameter[PositiveInt]] The number of GPUs assigned to each step replica. memory Optional[RuntimeParameter[PositiveInt]] The memory in bytes required for each step replica. resources Optional[RuntimeParameter[Dict[str, int]]] A dictionary containing the number of custom resources required for each step replica. Source code in src/distilabel/steps/base.py class StepResources(RuntimeParametersMixin, BaseModel):\n \"\"\"A class to define the resources assigned to a `_Step`.\n\n Attributes:\n replicas: The number of replicas for the step.\n cpus: The number of CPUs assigned to each step replica.\n gpus: The number of GPUs assigned to each step replica.\n memory: The memory in bytes required for each step replica.\n resources: A dictionary containing the number of custom resources required for\n each step replica.\n \"\"\"\n\n replicas: RuntimeParameter[PositiveInt] = Field(\n default=1, description=\"The number of replicas for the step.\"\n )\n cpus: Optional[RuntimeParameter[PositiveInt]] = Field(\n default=None, description=\"The number of CPUs assigned to each step replica.\"\n )\n gpus: Optional[RuntimeParameter[PositiveInt]] = Field(\n default=None, description=\"The number of GPUs assigned to each step replica.\"\n )\n memory: Optional[RuntimeParameter[PositiveInt]] = Field(\n default=None, description=\"The memory in bytes required for each step replica.\"\n )\n resources: Optional[RuntimeParameter[Dict[str, int]]] = Field(\n default=None,\n description=\"A dictionary containing names of custom resources and the\"\n \" number of those resources required for each step replica.\",\n )\n "},{"location":"api/step/typing/","title":"Step Typing","text":""},{"location":"api/step/typing/#distilabel.steps.typing","title":"typing ","text":""},{"location":"api/step/typing/#distilabel.steps.typing.StepOutput","title":"StepOutput = Iterator[List[Dict[str, Any]]] module-attribute ","text":"StepOutput is an alias of the typing Iterator[List[Dict[str, Any]]] "},{"location":"api/step/typing/#distilabel.steps.typing.GeneratorStepOutput","title":"GeneratorStepOutput = Iterator[Tuple[List[Dict[str, Any]], bool]] module-attribute ","text":"GeneratorStepOutput is an alias of the typing Iterator[Tuple[List[Dict[str, Any]], bool]] "},{"location":"api/step/typing/#distilabel.steps.typing.StepColumns","title":"StepColumns = Union[List[str], Dict[str, bool]] module-attribute ","text":"StepColumns is an alias of the typing Union[List[str], Dict[str, bool]] used by the inputs and outputs properties of an Step . In the case of a List[str] , it is a list with the required columns. 
In the case of a Dict[str, bool] , it is a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not. "},{"location":"api/step_gallery/argilla/","title":"Argilla","text":"This section contains the existing steps integrated with Argilla so as to easily push the generated datasets to Argilla. "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base","title":"base ","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase","title":"ArgillaBase ","text":" Bases: Step , ABC Abstract step that provides a class to subclass from, that contains the boilerplate code required to interact with Argilla, as well as some extra validations on top of it. It also defines the abstract methods that need to be implemented in order to add a new dataset type as a step. Note This class is not intended to be instanced directly, but via subclass. Attributes: Name Type Description dataset_name RuntimeParameter[str] The name of the dataset in Argilla where the records will be added. dataset_workspace Optional[RuntimeParameter[str]] The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. api_url Optional[RuntimeParameter[str]] The URL of the Argilla API. Defaults to None , which means it will be read from the ARGILLA_API_URL environment variable. api_key Optional[RuntimeParameter[SecretStr]] The API key to authenticate with Argilla. Defaults to None , which means it will be read from the ARGILLA_API_KEY environment variable. Runtime parameters dataset_name : The name of the dataset in Argilla where the records will be added. dataset_workspace : The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. api_url : The base URL to use for the Argilla API requests. api_key : The API key to authenticate the requests to the Argilla API. Input columns - dynamic, based on the
inputs value provided Source code in src/distilabel/steps/argilla/base.py class ArgillaBase(Step, ABC):\n \"\"\"Abstract step that provides a class to subclass from, that contains the boilerplate code\n required to interact with Argilla, as well as some extra validations on top of it. It also defines\n the abstract methods that need to be implemented in order to add a new dataset type as a step.\n\n Note:\n This class is not intended to be instanced directly, but via subclass.\n\n Attributes:\n dataset_name: The name of the dataset in Argilla where the records will be added.\n dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n `None`, which means it will be created in the default workspace.\n api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n the `ARGILLA_API_URL` environment variable.\n api_key: The API key to authenticate with Argilla. Defaults to `None`, which means it will\n be read from the `ARGILLA_API_KEY` environment variable.\n\n Runtime parameters:\n - `dataset_name`: The name of the dataset in Argilla where the records will be\n added.\n - `dataset_workspace`: The workspace where the dataset will be created in Argilla.\n Defaults to `None`, which means it will be created in the default workspace.\n - `api_url`: The base URL to use for the Argilla API requests.\n - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n Input columns:\n - dynamic, based on the `inputs` value provided\n \"\"\"\n\n dataset_name: RuntimeParameter[str] = Field(\n default=None, description=\"The name of the dataset in Argilla.\"\n )\n dataset_workspace: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The workspace where the dataset will be created in Argilla. Defaults \"\n \"to `None` which means it will be created in the default workspace.\",\n )\n\n api_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(_ARGILLA_API_URL_ENV_VAR_NAME),\n description=\"The base URL to use for the Argilla API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_ARGILLA_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Argilla API.\",\n )\n\n _client: Optional[\"Argilla\"] = PrivateAttr(...)\n _dataset: Optional[\"Dataset\"] = PrivateAttr(...)\n\n def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings.\"\"\"\n super().model_post_init(__context)\n\n if importlib.util.find_spec(\"argilla\") is None:\n raise ImportError(\n \"Argilla is not installed. 
Please install it using `pip install argilla\"\n \" --upgrade`.\"\n )\n\n def _client_init(self) -> None:\n \"\"\"Initializes the Argilla API client with the provided `api_url` and `api_key`.\"\"\"\n try:\n self._client = rg.Argilla( # type: ignore\n api_url=self.api_url,\n api_key=self.api_key.get_secret_value(), # type: ignore\n headers={\"Authorization\": f\"Bearer {os.environ['HF_TOKEN']}\"}\n if isinstance(self.api_url, str)\n and \"hf.space\" in self.api_url\n and \"HF_TOKEN\" in os.environ\n else {},\n )\n except Exception as e:\n raise DistilabelUserError(\n f\"Failed to initialize the Argilla API: {e}\",\n page=\"sections/how_to_guides/advanced/argilla/\",\n ) from e\n\n @property\n def _dataset_exists_in_workspace(self) -> bool:\n \"\"\"Checks if the dataset already exists in Argilla in the provided workspace if any.\n\n Returns:\n `True` if the dataset exists, `False` otherwise.\n \"\"\"\n return (\n self._client.datasets( # type: ignore\n name=self.dataset_name, # type: ignore\n workspace=self.dataset_workspace,\n )\n is not None\n )\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs of the step is an empty list, since the steps subclassing from this one, will\n always be leaf nodes and won't propagate the inputs neither generate any outputs.\n \"\"\"\n return []\n\n def load(self) -> None:\n \"\"\"Method to perform any initialization logic before the `process` method is\n called. For example, to load an LLM, stablish a connection to a database, etc.\n \"\"\"\n super().load()\n\n if self.api_url is None or self.api_key is None:\n raise DistilabelUserError(\n \"`Argilla` step requires the `api_url` and `api_key` to be provided. Please,\"\n \" provide those at step instantiation, via environment variables `ARGILLA_API_URL`\"\n \" and `ARGILLA_API_KEY`, or as `Step` runtime parameters via `pipeline.run(parameters={...})`.\",\n page=\"sections/how_to_guides/advanced/argilla/\",\n )\n\n self._client_init()\n\n @property\n @abstractmethod\n def inputs(self) -> \"StepColumns\": ...\n\n @abstractmethod\n def process(self, *inputs: StepInput) -> \"StepOutput\": ...\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.outputs","title":"outputs: StepColumns property ","text":"The outputs of the step is an empty list, since the steps subclassing from this one, will always be leaf nodes and won't propagate the inputs neither generate any outputs. "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.model_post_init","title":"model_post_init(__context) ","text":"Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings. Source code in src/distilabel/steps/argilla/base.py def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings.\"\"\"\n super().model_post_init(__context)\n\n if importlib.util.find_spec(\"argilla\") is None:\n raise ImportError(\n \"Argilla is not installed. Please install it using `pip install argilla\"\n \" --upgrade`.\"\n )\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.load","title":"load() ","text":"Method to perform any initialization logic before the process method is called. For example, to load an LLM, stablish a connection to a database, etc. Source code in src/distilabel/steps/argilla/base.py def load(self) -> None:\n \"\"\"Method to perform any initialization logic before the `process` method is\n called. 
For example, to load an LLM, stablish a connection to a database, etc.\n \"\"\"\n super().load()\n\n if self.api_url is None or self.api_key is None:\n raise DistilabelUserError(\n \"`Argilla` step requires the `api_url` and `api_key` to be provided. Please,\"\n \" provide those at step instantiation, via environment variables `ARGILLA_API_URL`\"\n \" and `ARGILLA_API_KEY`, or as `Step` runtime parameters via `pipeline.run(parameters={...})`.\",\n page=\"sections/how_to_guides/advanced/argilla/\",\n )\n\n self._client_init()\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference","title":"preference ","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla","title":"PreferenceToArgilla ","text":" Bases: ArgillaBase Creates a preference dataset in Argilla. Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a preference dataset, where there's one field for the instruction and one extra field per each generation within the same record, and then a rating question per each of the generation fields. The rating question asks the annotator to set a rating from 1 to 5 for each of the provided generations. Note This step is meant to be used in conjunction with the UltraFeedback step, or any other step generating both ratings and responses for a given set of instruction and generations for the given instruction. But alternatively, it can also be used with any other task or step generating only the instruction and generations , as the ratings and rationales are optional. Attributes: Name Type Description num_generations int The number of generations to include in the dataset. dataset_name int The name of the dataset in Argilla. dataset_workspace int The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. api_url int The URL of the Argilla API. Defaults to None , which means it will be read from the ARGILLA_API_URL environment variable. api_key int The API key to authenticate with Argilla. Defaults to None , which means it will be read from the ARGILLA_API_KEY environment variable. Runtime parameters api_url : The base URL to use for the Argilla API requests. api_key : The API key to authenticate the requests to the Argilla API. Input columns - instruction (
str ): The instruction that was used to generate the completion. - generations (
List[str] ): The completions that were generated based on the input instruction. - ratings (
List[str] , optional): The ratings for the generations. If not provided, the generated ratings won't be pushed to Argilla. - rationales (
List[str] , optional): The rationales for the ratings. If not provided, the generated rationales won't be pushed to Argilla. Examples: Push a preference dataset to an Argilla instance: from distilabel.steps import PreferenceToArgilla\n\nto_argilla = PreferenceToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n It can also include ratings and rationales: result = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n \"ratings\": [\"4\", \"5\"],\n \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n }\n ],\n )\n)\n# >>> result\n# [\n# {\n# 'instruction': 'instruction',\n# 'generations': ['first_generation', 'second_generation'],\n# 'ratings': ['4', '5'],\n# 'rationales': ['rationale for 4', 'rationale for 5']\n# }\n# ]\n Source code in src/distilabel/steps/argilla/preference.py class PreferenceToArgilla(ArgillaBase):\n \"\"\"Creates a preference dataset in Argilla.\n\n Step that creates a dataset in Argilla during the load phase, and then pushes the input\n batches into it as records. This dataset is a preference dataset, where there's one field\n for the instruction and one extra field per each generation within the same record, and then\n a rating question per each of the generation fields. The rating question asks the annotator to\n set a rating from 1 to 5 for each of the provided generations.\n\n Note:\n This step is meant to be used in conjunction with the `UltraFeedback` step, or any other step\n generating both ratings and responses for a given set of instruction and generations for the\n given instruction. But alternatively, it can also be used with any other task or step generating\n only the `instruction` and `generations`, as the `ratings` and `rationales` are optional.\n\n Attributes:\n num_generations: The number of generations to include in the dataset.\n dataset_name: The name of the dataset in Argilla.\n dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n `None`, which means it will be created in the default workspace.\n api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n the `ARGILLA_API_URL` environment variable.\n api_key: The API key to authenticate with Argilla. Defaults to `None`, which means it will\n be read from the `ARGILLA_API_KEY` environment variable.\n\n Runtime parameters:\n - `api_url`: The base URL to use for the Argilla API requests.\n - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n Input columns:\n - instruction (`str`): The instruction that was used to generate the completion.\n - generations (`List[str]`): The completion that was generated based on the input instruction.\n - ratings (`List[str]`, optional): The ratings for the generations. If not provided, the\n generated ratings won't be pushed to Argilla.\n - rationales (`List[str]`, optional): The rationales for the ratings. 
If not provided, the\n generated rationales won't be pushed to Argilla.\n\n Examples:\n Push a preference dataset to an Argilla instance:\n\n ```python\n from distilabel.steps import PreferenceToArgilla\n\n to_argilla = PreferenceToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n )\n to_argilla.load()\n\n result = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n }\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n ```\n\n It can also include ratings and rationales:\n\n ```python\n result = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n \"ratings\": [\"4\", \"5\"],\n \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n }\n ],\n )\n )\n # >>> result\n # [\n # {\n # 'instruction': 'instruction',\n # 'generations': ['first_generation', 'second_generation'],\n # 'ratings': ['4', '5'],\n # 'rationales': ['rationale for 4', 'rationale for 5']\n # }\n # ]\n ```\n \"\"\"\n\n num_generations: int\n\n _id: str = PrivateAttr(default=\"id\")\n _instruction: str = PrivateAttr(...)\n _generations: str = PrivateAttr(...)\n _ratings: str = PrivateAttr(...)\n _rationales: str = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Sets the `_instruction` and `_generations` attributes based on the `inputs_mapping`, otherwise\n uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n the text-generation scenario. And then it pushes it to Argilla.\n \"\"\"\n super().load()\n\n # Both `instruction` and `generations` will be used as the fields of the dataset\n self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n self._generations = self.input_mappings.get(\"generations\", \"generations\")\n # Both `ratings` and `rationales` will be used as suggestions to the default questions of the dataset\n self._ratings = self.input_mappings.get(\"ratings\", \"ratings\")\n self._rationales = self.input_mappings.get(\"rationales\", \"rationales\")\n\n if self._dataset_exists_in_workspace:\n _dataset = self._client.datasets( # type: ignore\n name=self.dataset_name, # type: ignore\n workspace=self.dataset_workspace, # type: ignore\n )\n\n for field in _dataset.fields:\n if not isinstance(field, rg.TextField):\n continue\n if (\n field.name\n not in [self._id, self._instruction] # type: ignore\n + [\n f\"{self._generations}-{idx}\"\n for idx in range(self.num_generations)\n ]\n and field.required\n ):\n raise DistilabelUserError(\n f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n f\" already exists, but contains at least a required field that is\"\n f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generations}`\"\n f\" (one per generation starting from 0 up to {self.num_generations - 1}).\",\n page=\"components-gallery/steps/preferencetoargilla/\",\n )\n\n self._dataset = _dataset\n else:\n _settings = rg.Settings( # type: ignore\n fields=[\n rg.TextField(name=self._id, title=self._id), # type: ignore\n rg.TextField(name=self._instruction, title=self._instruction), # type: ignore\n *self._generation_fields(), # type: ignore\n ],\n questions=self._rating_rationale_pairs(), # type: ignore\n )\n _dataset = rg.Dataset( # type: 
ignore\n name=self.dataset_name,\n workspace=self.dataset_workspace,\n settings=_settings,\n client=self._client,\n )\n self._dataset = _dataset.create()\n\n def _generation_fields(self) -> List[\"TextField\"]:\n \"\"\"Method to generate the fields for each of the generations.\n\n Returns:\n A list containing `TextField`s for each text generation.\n \"\"\"\n return [\n rg.TextField( # type: ignore\n name=f\"{self._generations}-{idx}\",\n title=f\"{self._generations}-{idx}\",\n required=True if idx == 0 else False,\n )\n for idx in range(self.num_generations)\n ]\n\n def _rating_rationale_pairs(\n self,\n ) -> List[Union[\"RatingQuestion\", \"TextQuestion\"]]:\n \"\"\"Method to generate the rating and rationale questions for each of the generations.\n\n Returns:\n A list of questions containing a `RatingQuestion` and `TextQuestion` pair for\n each text generation.\n \"\"\"\n questions = []\n for idx in range(self.num_generations):\n questions.extend(\n [\n rg.RatingQuestion( # type: ignore\n name=f\"{self._generations}-{idx}-rating\",\n title=f\"Rate {self._generations}-{idx} given {self._instruction}.\",\n description=f\"Ignore this question if the corresponding `{self._generations}-{idx}` field is not available.\"\n if idx != 0\n else None,\n values=[1, 2, 3, 4, 5],\n required=True if idx == 0 else False,\n ),\n rg.TextQuestion( # type: ignore\n name=f\"{self._generations}-{idx}-rationale\",\n title=f\"Specify the rationale for {self._generations}-{idx}'s rating.\",\n description=f\"Ignore this question if the corresponding `{self._generations}-{idx}` field is not available.\"\n if idx != 0\n else None,\n required=False,\n ),\n ]\n )\n return questions\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the step are the `instruction` and the `generations`. Optionally, one could also\n provide the `ratings` and the `rationales` for the generations.\"\"\"\n return [\"instruction\", \"generations\"]\n\n @property\n def optional_inputs(self) -> List[str]:\n \"\"\"The optional inputs for the step are the `ratings` and the `rationales` for the generations.\"\"\"\n return [\"ratings\", \"rationales\"]\n\n def _add_suggestions_if_any(self, input: Dict[str, Any]) -> List[\"Suggestion\"]:\n \"\"\"Method to generate the suggestions for the `rg.Record` based on the input.\n\n Returns:\n A list of `Suggestion`s for the rating and rationales questions.\n \"\"\"\n # Since the `suggestions` i.e. 
answers to the `questions` are optional, will default to {}\n suggestions = []\n # If `ratings` is in `input`, then add those as suggestions\n if self._ratings in input:\n suggestions.extend(\n [\n rg.Suggestion( # type: ignore\n value=rating,\n question_name=f\"{self._generations}-{idx}-rating\",\n )\n for idx, rating in enumerate(input[self._ratings])\n if rating is not None\n and isinstance(rating, int)\n and rating in [1, 2, 3, 4, 5]\n ],\n )\n # If `rationales` is in `input`, then add those as suggestions\n if self._rationales in input:\n suggestions.extend(\n [\n rg.Suggestion( # type: ignore\n value=rationale,\n question_name=f\"{self._generations}-{idx}-rationale\",\n )\n for idx, rationale in enumerate(input[self._rationales])\n if rationale is not None and isinstance(rationale, str)\n ],\n )\n return suggestions\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Creates and pushes the records as `rg.Record`s to the Argilla dataset.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n records = []\n for input in inputs:\n # Generate the SHA-256 hash of the instruction to use it as the metadata\n instruction_id = hashlib.sha256(\n input[\"instruction\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n generations = {\n f\"{self._generations}-{idx}\": generation\n for idx, generation in enumerate(input[\"generations\"]) # type: ignore\n }\n\n records.append( # type: ignore\n rg.Record( # type: ignore\n fields={\n \"id\": instruction_id,\n \"instruction\": input[\"instruction\"], # type: ignore\n **generations,\n },\n suggestions=self._add_suggestions_if_any(input), # type: ignore\n )\n )\n self._dataset.records.log(records) # type: ignore\n yield inputs\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.inputs","title":"inputs: List[str] property ","text":"The inputs for the step are the instruction and the generations . Optionally, one could also provide the ratings and the rationales for the generations. "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.optional_inputs","title":"optional_inputs: List[str] property ","text":"The optional inputs for the step are the ratings and the rationales for the generations. "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.load","title":"load() ","text":"Sets the _instruction and _generations attributes based on the inputs_mapping , otherwise uses the default values; and then uses those values to create a FeedbackDataset suited for the text-generation scenario. And then it pushes it to Argilla. Source code in src/distilabel/steps/argilla/preference.py def load(self) -> None:\n \"\"\"Sets the `_instruction` and `_generations` attributes based on the `inputs_mapping`, otherwise\n uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n the text-generation scenario. 
And then it pushes it to Argilla.\n \"\"\"\n super().load()\n\n # Both `instruction` and `generations` will be used as the fields of the dataset\n self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n self._generations = self.input_mappings.get(\"generations\", \"generations\")\n # Both `ratings` and `rationales` will be used as suggestions to the default questions of the dataset\n self._ratings = self.input_mappings.get(\"ratings\", \"ratings\")\n self._rationales = self.input_mappings.get(\"rationales\", \"rationales\")\n\n if self._dataset_exists_in_workspace:\n _dataset = self._client.datasets( # type: ignore\n name=self.dataset_name, # type: ignore\n workspace=self.dataset_workspace, # type: ignore\n )\n\n for field in _dataset.fields:\n if not isinstance(field, rg.TextField):\n continue\n if (\n field.name\n not in [self._id, self._instruction] # type: ignore\n + [\n f\"{self._generations}-{idx}\"\n for idx in range(self.num_generations)\n ]\n and field.required\n ):\n raise DistilabelUserError(\n f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n f\" already exists, but contains at least a required field that is\"\n f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generations}`\"\n f\" (one per generation starting from 0 up to {self.num_generations - 1}).\",\n page=\"components-gallery/steps/preferencetoargilla/\",\n )\n\n self._dataset = _dataset\n else:\n _settings = rg.Settings( # type: ignore\n fields=[\n rg.TextField(name=self._id, title=self._id), # type: ignore\n rg.TextField(name=self._instruction, title=self._instruction), # type: ignore\n *self._generation_fields(), # type: ignore\n ],\n questions=self._rating_rationale_pairs(), # type: ignore\n )\n _dataset = rg.Dataset( # type: ignore\n name=self.dataset_name,\n workspace=self.dataset_workspace,\n settings=_settings,\n client=self._client,\n )\n self._dataset = _dataset.create()\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.process","title":"process(inputs) ","text":"Creates and pushes the records as rg.Record s to the Argilla dataset. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Returns: Type Description StepOutput A list of Python dictionaries with the outputs of the task. 
Source code in src/distilabel/steps/argilla/preference.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Creates and pushes the records as `rg.Record`s to the Argilla dataset.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n records = []\n for input in inputs:\n # Generate the SHA-256 hash of the instruction to use it as the metadata\n instruction_id = hashlib.sha256(\n input[\"instruction\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n generations = {\n f\"{self._generations}-{idx}\": generation\n for idx, generation in enumerate(input[\"generations\"]) # type: ignore\n }\n\n records.append( # type: ignore\n rg.Record( # type: ignore\n fields={\n \"id\": instruction_id,\n \"instruction\": input[\"instruction\"], # type: ignore\n **generations,\n },\n suggestions=self._add_suggestions_if_any(input), # type: ignore\n )\n )\n self._dataset.records.log(records) # type: ignore\n yield inputs\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation","title":"text_generation ","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla","title":"TextGenerationToArgilla ","text":" Bases: ArgillaBase Creates a text generation dataset in Argilla. Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a text-generation dataset, where there's one field per each input, and then a label question to rate the quality of the completion in either bad (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d). Note This step is meant to be used in conjunction with a TextGeneration step and no column mapping is needed, as it will use the default values for the instruction and generation columns. Attributes: Name Type Description dataset_name The name of the dataset in Argilla. dataset_workspace The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. api_url The URL of the Argilla API. Defaults to None , which means it will be read from the ARGILLA_API_URL environment variable. api_key The API key to authenticate with Argilla. Defaults to None , which means it will be read from the ARGILLA_API_KEY environment variable. Runtime parameters api_url : The base URL to use for the Argilla API requests. api_key : The API key to authenticate the requests to the Argilla API. Input columns - instruction (
str ): The instruction that was used to generate the completion. - generation (
str or List[str] ): The completions that were generated based on the input instruction. Examples: Push a text generation dataset to an Argilla instance: from distilabel.steps import PreferenceToArgilla\n\nto_argilla = TextGenerationToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generation\": \"generation\",\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generation': 'generation'}]\n Source code in src/distilabel/steps/argilla/text_generation.py class TextGenerationToArgilla(ArgillaBase):\n \"\"\"Creates a text generation dataset in Argilla.\n\n `Step` that creates a dataset in Argilla during the load phase, and then pushes the input\n batches into it as records. This dataset is a text-generation dataset, where there's one field\n per each input, and then a label question to rate the quality of the completion in either bad\n (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d).\n\n Note:\n This step is meant to be used in conjunction with a `TextGeneration` step and no column mapping\n is needed, as it will use the default values for the `instruction` and `generation` columns.\n\n Attributes:\n dataset_name: The name of the dataset in Argilla.\n dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n `None`, which means it will be created in the default workspace.\n api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n the `ARGILLA_API_URL` environment variable.\n api_key: The API key to authenticate with Argilla. Defaults to `None`, which means it will\n be read from the `ARGILLA_API_KEY` environment variable.\n\n Runtime parameters:\n - `api_url`: The base URL to use for the Argilla API requests.\n - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n Input columns:\n - instruction (`str`): The instruction that was used to generate the completion.\n - generation (`str` or `List[str]`): The completions that were generated based on the input instruction.\n\n Examples:\n Push a text generation dataset to an Argilla instance:\n\n ```python\n from distilabel.steps import PreferenceToArgilla\n\n to_argilla = TextGenerationToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n )\n to_argilla.load()\n\n result = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generation\": \"generation\",\n }\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction', 'generation': 'generation'}]\n ```\n \"\"\"\n\n _id: str = PrivateAttr(default=\"id\")\n _instruction: str = PrivateAttr(...)\n _generation: str = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Sets the `_instruction` and `_generation` attributes based on the `inputs_mapping`, otherwise\n uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n the text-generation scenario. 
And then it pushes it to Argilla.\n \"\"\"\n super().load()\n\n self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n self._generation = self.input_mappings.get(\"generation\", \"generation\")\n\n if self._dataset_exists_in_workspace:\n _dataset = self._client.datasets( # type: ignore\n name=self.dataset_name, # type: ignore\n workspace=self.dataset_workspace, # type: ignore\n )\n\n for field in _dataset.fields:\n if not isinstance(field, rg.TextField): # type: ignore\n continue\n if (\n field.name not in [self._id, self._instruction, self._generation]\n and field.required\n ):\n raise DistilabelUserError(\n f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n f\" already exists, but contains at least a required field that is\"\n f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generation}`,\"\n \" so it cannot be reused for this dataset.\",\n page=\"components-gallery/steps/textgenerationtoargilla/\",\n )\n\n self._dataset = _dataset\n else:\n _settings = rg.Settings( # type: ignore\n fields=[\n rg.TextField(name=self._id, title=self._id), # type: ignore\n rg.TextField(name=self._instruction, title=self._instruction), # type: ignore\n rg.TextField(name=self._generation, title=self._generation), # type: ignore\n ],\n questions=[\n rg.LabelQuestion( # type: ignore\n name=\"quality\",\n title=f\"What's the quality of the {self._generation} for the given {self._instruction}?\",\n labels={\"bad\": \"\ud83d\udc4e\", \"good\": \"\ud83d\udc4d\"}, # type: ignore\n )\n ],\n )\n _dataset = rg.Dataset( # type: ignore\n name=self.dataset_name,\n workspace=self.dataset_workspace,\n settings=_settings,\n client=self._client,\n )\n self._dataset = _dataset.create()\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the step are the `instruction` and the `generation`.\"\"\"\n return [\"instruction\", \"generation\"]\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Creates and pushes the records as FeedbackRecords to the Argilla dataset.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n records = []\n for input in inputs:\n # Generate the SHA-256 hash of the instruction to use it as the metadata\n instruction_id = hashlib.sha256(\n input[\"instruction\"].encode(\"utf-8\")\n ).hexdigest()\n\n generations = input[\"generation\"]\n\n # If the `generation` is not a list, then convert it into a list\n if not isinstance(generations, list):\n generations = [generations]\n\n # Create a `generations_set` to avoid adding duplicates\n generations_set = set()\n\n for generation in generations:\n # If the generation is already in the set, then skip it\n if generation in generations_set:\n continue\n # Otherwise, add it to the set\n generations_set.add(generation)\n\n records.append(\n rg.Record( # type: ignore\n fields={\n self._id: instruction_id,\n self._instruction: input[\"instruction\"],\n self._generation: generation,\n },\n ),\n )\n self._dataset.records.log(records) # type: ignore\n yield inputs\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.inputs","title":"inputs: List[str] property ","text":"The inputs for the step are the instruction and the generation . 
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.load","title":"load() ","text":"Sets the _instruction and _generation attributes based on the inputs_mapping , otherwise uses the default values; and then uses those values to create a FeedbackDataset suited for the text-generation scenario. And then it pushes it to Argilla. Source code in src/distilabel/steps/argilla/text_generation.py def load(self) -> None:\n \"\"\"Sets the `_instruction` and `_generation` attributes based on the `inputs_mapping`, otherwise\n uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n the text-generation scenario. And then it pushes it to Argilla.\n \"\"\"\n super().load()\n\n self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n self._generation = self.input_mappings.get(\"generation\", \"generation\")\n\n if self._dataset_exists_in_workspace:\n _dataset = self._client.datasets( # type: ignore\n name=self.dataset_name, # type: ignore\n workspace=self.dataset_workspace, # type: ignore\n )\n\n for field in _dataset.fields:\n if not isinstance(field, rg.TextField): # type: ignore\n continue\n if (\n field.name not in [self._id, self._instruction, self._generation]\n and field.required\n ):\n raise DistilabelUserError(\n f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n f\" already exists, but contains at least a required field that is\"\n f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generation}`,\"\n \" so it cannot be reused for this dataset.\",\n page=\"components-gallery/steps/textgenerationtoargilla/\",\n )\n\n self._dataset = _dataset\n else:\n _settings = rg.Settings( # type: ignore\n fields=[\n rg.TextField(name=self._id, title=self._id), # type: ignore\n rg.TextField(name=self._instruction, title=self._instruction), # type: ignore\n rg.TextField(name=self._generation, title=self._generation), # type: ignore\n ],\n questions=[\n rg.LabelQuestion( # type: ignore\n name=\"quality\",\n title=f\"What's the quality of the {self._generation} for the given {self._instruction}?\",\n labels={\"bad\": \"\ud83d\udc4e\", \"good\": \"\ud83d\udc4d\"}, # type: ignore\n )\n ],\n )\n _dataset = rg.Dataset( # type: ignore\n name=self.dataset_name,\n workspace=self.dataset_workspace,\n settings=_settings,\n client=self._client,\n )\n self._dataset = _dataset.create()\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.process","title":"process(inputs) ","text":"Creates and pushes the records as FeedbackRecords to the Argilla dataset. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Returns: Type Description StepOutput A list of Python dictionaries with the outputs of the task. 
Source code in src/distilabel/steps/argilla/text_generation.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Creates and pushes the records as FeedbackRecords to the Argilla dataset.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n records = []\n for input in inputs:\n # Generate the SHA-256 hash of the instruction to use it as the metadata\n instruction_id = hashlib.sha256(\n input[\"instruction\"].encode(\"utf-8\")\n ).hexdigest()\n\n generations = input[\"generation\"]\n\n # If the `generation` is not a list, then convert it into a list\n if not isinstance(generations, list):\n generations = [generations]\n\n # Create a `generations_set` to avoid adding duplicates\n generations_set = set()\n\n for generation in generations:\n # If the generation is already in the set, then skip it\n if generation in generations_set:\n continue\n # Otherwise, add it to the set\n generations_set.add(generation)\n\n records.append(\n rg.Record( # type: ignore\n fields={\n self._id: instruction_id,\n self._instruction: input[\"instruction\"],\n self._generation: generation,\n },\n ),\n )\n self._dataset.records.log(records) # type: ignore\n yield inputs\n "},{"location":"api/step_gallery/columns/","title":"Columns","text":"This section contains the existing steps intended to be used for common column operations to apply to the batches. "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand","title":"expand ","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns","title":"ExpandColumns ","text":" Bases: Step Expand columns that contain lists into multiple rows. ExpandColumns is a Step that takes a list of columns and expands them into multiple rows. The new rows will have the same data as the original row, except for the expanded column, which will contain a single item from the original list. Attributes: Name Type Description columns Union[Dict[str, str], List[str]] A dictionary that maps the column to be expanded to the new column name or a list of columns to be expanded. If a list is provided, the new column name will be the same as the column name. encoded Union[bool, List[str]] A bool to inform Whether the columns are JSON encoded lists. If this value is set to True, the columns will be decoded before expanding. Alternatively, to specify columns that can be encoded, a list can be provided. In this case, the column names informed must be a subset of the columns selected for expansion. split_statistics bool A bool to inform whether the statistics in the distilabel_metadata column should be split into multiple rows. If we want to expand some columns containing a list of strings that come from having parsed the output of an LLM, the tokens in the statistics_{step_name} of the distilabel_metadata column should be splitted to avoid multiplying them if we aggregate the data afterwards. For example, with a task that is supposed to generate a list of N instructions, and we want each of those N instructions in different rows, we should split the statistics by N. In such a case, set this value to True. Input columns - dynamic (determined by
columns attribute): The columns to be expanded into multiple rows. Output columns - dynamic (determined by
columns attribute): The expanded columns. Categories Examples: Expand the selected columns into multiple rows: from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"]}\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n Expand the selected columns which are JSON encoded into multiple rows: from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n encoded=True, # It can also be a list of columns that are encoded, i.e. [\"generation\"]\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": '[\"generation 1\", \"generation 2\"]'}\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n Expand the selected columns and split the statistics in the distilabel_metadata column: from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n split_statistics=True,\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"],\n \"distilabel_metadata\": {\n \"statistics_generation\": {\n \"input_tokens\": [12],\n \"output_tokens\": [12],\n },\n },\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}, {'instruction': 'instruction 1', 'generation': 'generation 2', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}]\n Source code in src/distilabel/steps/columns/expand.py class ExpandColumns(Step):\n \"\"\"Expand columns that contain lists into multiple rows.\n\n `ExpandColumns` is a `Step` that takes a list of columns and expands them into multiple\n rows. The new rows will have the same data as the original row, except for the expanded\n column, which will contain a single item from the original list.\n\n Attributes:\n columns: A dictionary that maps the column to be expanded to the new column name\n or a list of columns to be expanded. If a list is provided, the new column name\n will be the same as the column name.\n encoded: A bool to inform Whether the columns are JSON encoded lists. If this value is\n set to True, the columns will be decoded before expanding. Alternatively, to specify\n columns that can be encoded, a list can be provided. In this case, the column names\n informed must be a subset of the columns selected for expansion.\n split_statistics: A bool to inform whether the statistics in the `distilabel_metadata`\n column should be split into multiple rows.\n If we want to expand some columns containing a list of strings that come from\n having parsed the output of an LLM, the tokens in the `statistics_{step_name}`\n of the `distilabel_metadata` column should be splitted to avoid multiplying\n them if we aggregate the data afterwards. 
For example, with a task that is supposed\n to generate a list of N instructions, and we want each of those N instructions in\n different rows, we should split the statistics by N.\n In such a case, set this value to True.\n\n Input columns:\n - dynamic (determined by `columns` attribute): The columns to be expanded into\n multiple rows.\n\n Output columns:\n - dynamic (determined by `columns` attribute): The expanded columns.\n\n Categories:\n - columns\n\n Examples:\n Expand the selected columns into multiple rows:\n\n ```python\n from distilabel.steps import ExpandColumns\n\n expand_columns = ExpandColumns(\n columns=[\"generation\"],\n )\n expand_columns.load()\n\n result = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"]}\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n ```\n\n Expand the selected columns which are JSON encoded into multiple rows:\n\n ```python\n from distilabel.steps import ExpandColumns\n\n expand_columns = ExpandColumns(\n columns=[\"generation\"],\n encoded=True, # It can also be a list of columns that are encoded, i.e. [\"generation\"]\n )\n expand_columns.load()\n\n result = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": '[\"generation 1\", \"generation 2\"]'}\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n ```\n\n Expand the selected columns and split the statistics in the `distilabel_metadata` column:\n\n ```python\n from distilabel.steps import ExpandColumns\n\n expand_columns = ExpandColumns(\n columns=[\"generation\"],\n split_statistics=True,\n )\n expand_columns.load()\n\n result = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"],\n \"distilabel_metadata\": {\n \"statistics_generation\": {\n \"input_tokens\": [12],\n \"output_tokens\": [12],\n },\n },\n }\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction 1', 'generation': 'generation 1', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}, {'instruction': 'instruction 1', 'generation': 'generation 2', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}]\n ```\n \"\"\"\n\n columns: Union[Dict[str, str], List[str]]\n encoded: Union[bool, List[str]] = False\n split_statistics: bool = False\n\n @field_validator(\"columns\")\n @classmethod\n def always_dict(cls, value: Union[Dict[str, str], List[str]]) -> Dict[str, str]:\n \"\"\"Ensure that the columns are always a dictionary.\n\n Args:\n value: The columns to be expanded.\n\n Returns:\n The columns to be expanded as a dictionary.\n \"\"\"\n if isinstance(value, list):\n return {col: col for col in value}\n\n return value\n\n @model_validator(mode=\"after\")\n def is_subset(self) -> Self:\n \"\"\"Ensure the \"encoded\" column names are a subset of the \"columns\" selected.\n\n Returns:\n The \"encoded\" attribute updated to work internally.\n \"\"\"\n if isinstance(self.encoded, list):\n if not set(self.encoded).issubset(set(self.columns.keys())):\n raise ValueError(\n \"The 'encoded' columns must be a subset of the 'columns' selected for expansion.\"\n )\n if isinstance(self.encoded, bool):\n self.encoded = 
list(self.columns.keys()) if self.encoded else []\n return self\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The columns to be expanded.\"\"\"\n return list(self.columns.keys())\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The expanded columns.\"\"\"\n return [\n new_column if new_column else expand_column\n for expand_column, new_column in self.columns.items()\n ]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Expand the columns in the input data.\n\n Args:\n inputs: The input data.\n\n Yields:\n The expanded rows.\n \"\"\"\n if self.encoded:\n for input in inputs:\n for column in self.encoded:\n input[column] = json.loads(input[column])\n\n yield [row for input in inputs for row in self._expand_columns(input)]\n\n def _expand_columns(self, input: Dict[str, Any]) -> List[Dict[str, Any]]:\n \"\"\"Expand the columns in the input data.\n\n Args:\n input: The input data.\n\n Returns:\n The expanded rows.\n \"\"\"\n metadata_visited = False\n expanded_rows = []\n # Update the columns here to avoid doing the validation on the `inputs`, as the\n # `distilabel_metadata` is not defined on Pipeline creation on the DAG.\n columns = self.columns\n if self.split_statistics:\n columns[\"distilabel_metadata\"] = \"distilabel_metadata\"\n\n for expand_column, new_column in columns.items(): # type: ignore\n data = input.get(expand_column)\n input, metadata_visited = self._split_metadata(\n input, len(data), metadata_visited\n )\n\n rows = []\n for item, expanded in zip_longest(*[data, expanded_rows], fillvalue=input):\n rows.append({**expanded, new_column: item})\n expanded_rows = rows\n return expanded_rows\n\n def _split_metadata(\n self, input: Dict[str, Any], n: int, metadata_visited: bool = False\n ) -> None:\n \"\"\"Help method to split the statistics in `distilabel_metadata` column.\n\n Args:\n input: The input data.\n n: Number of splits to apply to the tokens (if we have 12 tokens and want to split\n them 3 times, n==3).\n metadata_visited: Bool to prevent from updating the data more than once.\n\n Returns:\n Updated input with the `distilabel_metadata` updated.\n \"\"\"\n # - If we want to split the statistics, we need to ensure that the metadata is present.\n # - Metadata can only be visited once per row to avoid successive splitting.\n # TODO: For an odd number of tokens, this will miss 1, we have to fix it.\n if (\n self.split_statistics\n and (metadata := input.get(\"distilabel_metadata\", {}))\n and not metadata_visited\n ):\n for k, v in metadata.items():\n if (\n not v\n ): # In case it wasn't found in the metadata for some error, skip it\n continue\n if k.startswith(\"statistics_\") and (\n \"input_tokens\" in v and \"output_tokens\" in v\n ):\n # For num_generations>1 we assume all the tokens should be divided by n\n # TODO: The tokens should always come as a list, but there can\n # be differences\n if isinstance(v[\"input_tokens\"], list):\n input_tokens = [value // n for value in v[\"input_tokens\"]]\n else:\n input_tokens = [v[\"input_tokens\"] // n]\n if isinstance(v[\"input_tokens\"], list):\n output_tokens = [value // n for value in v[\"output_tokens\"]]\n else:\n output_tokens = [v[\"output_tokens\"] // n]\n\n input[\"distilabel_metadata\"][k] = {\n \"input_tokens\": input_tokens,\n \"output_tokens\": output_tokens,\n }\n metadata_visited = True\n # Once we have updated the metadata, Create a list out of it to let the\n # following section to expand it as any other column.\n if 
isinstance(input[\"distilabel_metadata\"], dict):\n input[\"distilabel_metadata\"] = [input[\"distilabel_metadata\"]] * n\n return input, metadata_visited\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.inputs","title":"inputs: StepColumns property ","text":"The columns to be expanded. "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.outputs","title":"outputs: StepColumns property ","text":"The expanded columns. "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.always_dict","title":"always_dict(value) classmethod ","text":"Ensure that the columns are always a dictionary. Parameters: Name Type Description Default value Union[Dict[str, str], List[str]] The columns to be expanded. required Returns: Type Description Dict[str, str] The columns to be expanded as a dictionary. Source code in src/distilabel/steps/columns/expand.py @field_validator(\"columns\")\n@classmethod\ndef always_dict(cls, value: Union[Dict[str, str], List[str]]) -> Dict[str, str]:\n \"\"\"Ensure that the columns are always a dictionary.\n\n Args:\n value: The columns to be expanded.\n\n Returns:\n The columns to be expanded as a dictionary.\n \"\"\"\n if isinstance(value, list):\n return {col: col for col in value}\n\n return value\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.is_subset","title":"is_subset() ","text":"Ensure the \"encoded\" column names are a subset of the \"columns\" selected. Returns: Type Description Self The \"encoded\" attribute updated to work internally. Source code in src/distilabel/steps/columns/expand.py @model_validator(mode=\"after\")\ndef is_subset(self) -> Self:\n \"\"\"Ensure the \"encoded\" column names are a subset of the \"columns\" selected.\n\n Returns:\n The \"encoded\" attribute updated to work internally.\n \"\"\"\n if isinstance(self.encoded, list):\n if not set(self.encoded).issubset(set(self.columns.keys())):\n raise ValueError(\n \"The 'encoded' columns must be a subset of the 'columns' selected for expansion.\"\n )\n if isinstance(self.encoded, bool):\n self.encoded = list(self.columns.keys()) if self.encoded else []\n return self\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.process","title":"process(inputs) ","text":"Expand the columns in the input data. Parameters: Name Type Description Default inputs StepInput The input data. required Yields: Type Description StepOutput The expanded rows. Source code in src/distilabel/steps/columns/expand.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Expand the columns in the input data.\n\n Args:\n inputs: The input data.\n\n Yields:\n The expanded rows.\n \"\"\"\n if self.encoded:\n for input in inputs:\n for column in self.encoded:\n input[column] = json.loads(input[column])\n\n yield [row for input in inputs for row in self._expand_columns(input)]\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep","title":"keep ","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns","title":"KeepColumns ","text":" Bases: Step Keeps selected columns in the dataset. KeepColumns is a Step that implements the process method that keeps only the columns specified in the columns attribute. Also KeepColumns provides an attribute columns to specify the columns to keep which will override the default value for the properties inputs and outputs . 
Note The order in which the columns are provided is important, as the output will be sorted using the provided order, which is useful before pushing either a datasets.Dataset via the PushToHub step or a distilabel.Distiset via the Pipeline.run output variable. Attributes: Name Type Description columns List[str] List of strings with the names of the columns to keep. Input columns - dynamic (determined by 
columns attribute): The columns to keep. Output columns - dynamic (determined by
columns attribute): The columns that were kept. Categories Examples: Select the columns to keep: from distilabel.steps import KeepColumns\n\nkeep_columns = KeepColumns(\n columns=[\"instruction\", \"generation\"],\n)\nkeep_columns.load()\n\nresult = next(\n keep_columns.process(\n [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n )\n)\n# >>> result\n# [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n Source code in src/distilabel/steps/columns/keep.py class KeepColumns(Step):\n \"\"\"Keeps selected columns in the dataset.\n\n `KeepColumns` is a `Step` that implements the `process` method that keeps only the columns\n specified in the `columns` attribute. Also `KeepColumns` provides an attribute `columns` to\n specify the columns to keep which will override the default value for the properties `inputs`\n and `outputs`.\n\n Note:\n The order in which the columns are provided is important, as the output will be sorted\n using the provided order, which is useful before pushing either a `dataset.Dataset` via\n the `PushToHub` step or a `distilabel.Distiset` via the `Pipeline.run` output variable.\n\n Attributes:\n columns: List of strings with the names of the columns to keep.\n\n Input columns:\n - dynamic (determined by `columns` attribute): The columns to keep.\n\n Output columns:\n - dynamic (determined by `columns` attribute): The columns that were kept.\n\n Categories:\n - columns\n\n Examples:\n Select the columns to keep:\n\n ```python\n from distilabel.steps import KeepColumns\n\n keep_columns = KeepColumns(\n columns=[\"instruction\", \"generation\"],\n )\n keep_columns.load()\n\n result = next(\n keep_columns.process(\n [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n )\n )\n # >>> result\n # [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n ```\n \"\"\"\n\n columns: List[str]\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task are the column names in `columns`.\"\"\"\n return self.columns\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs for the task are the column names in `columns`.\"\"\"\n return self.columns\n\n @override\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"The `process` method keeps only the columns specified in the `columns` attribute.\n\n Args:\n *inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n outputs = []\n for item in input:\n outputs.append({col: item[col] for col in self.columns})\n yield outputs\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task are the column names in columns . "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.outputs","title":"outputs: StepColumns property ","text":"The outputs for the task are the column names in columns . "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.process","title":"process(*inputs) ","text":"The process method keeps only the columns specified in the columns attribute. Parameters: Name Type Description Default *inputs StepInput A list of dictionaries with the input data. () Yields: Type Description StepOutput A list of dictionaries with the output data. 
Source code in src/distilabel/steps/columns/keep.py @override\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"The `process` method keeps only the columns specified in the `columns` attribute.\n\n Args:\n *inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n outputs = []\n for item in input:\n outputs.append({col: item[col] for col in self.columns})\n yield outputs\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.merge","title":"merge ","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.merge.MergeColumns","title":"MergeColumns ","text":" Bases: Step Merge columns from a row. MergeColumns is a Step that implements the process method that calls the merge_columns function to handle and combine columns in a StepInput . MergeColumns provides two attributes columns and output_column to specify the columns to merge and the resulting output column. This step can be useful if you have a Task that generates instructions for example, and you want to have more examples of those. In such a case, you could for example use another Task to multiply your instructions synthetically, what would yield two different columns splitted. Using MergeColumns you can merge them and use them as a single column in your dataset for further processing. Attributes: Name Type Description columns List[str] List of strings with the names of the columns to merge. output_column Optional[str] str name of the output column Input columns - dynamic (determined by
columns attribute): The columns to merge. Output columns - dynamic (determined by
columns and output_column attributes): The columns that were merged. Categories Examples: Combine columns in rows of a dataset: from distilabel.steps import MergeColumns\n\ncombiner = MergeColumns(\n columns=[\"queries\", \"multiple_queries\"],\n output_column=\"queries\",\n)\ncombiner.load()\n\nresult = next(\n combiner.process(\n [\n {\n \"queries\": \"How are you?\",\n \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n }\n ],\n )\n)\n# >>> result\n# [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n Source code in src/distilabel/steps/columns/merge.py class MergeColumns(Step):\n \"\"\"Merge columns from a row.\n\n `MergeColumns` is a `Step` that implements the `process` method that calls the `merge_columns`\n function to handle and combine columns in a `StepInput`. `MergeColumns` provides two attributes\n `columns` and `output_column` to specify the columns to merge and the resulting output column.\n\n This step can be useful if you have a `Task` that generates instructions for example, and you\n want to have more examples of those. In such a case, you could for example use another `Task`\n to multiply your instructions synthetically, what would yield two different columns splitted.\n Using `MergeColumns` you can merge them and use them as a single column in your dataset for\n further processing.\n\n Attributes:\n columns: List of strings with the names of the columns to merge.\n output_column: str name of the output column\n\n Input columns:\n - dynamic (determined by `columns` attribute): The columns to merge.\n\n Output columns:\n - dynamic (determined by `columns` and `output_column` attributes): The columns\n that were merged.\n\n Categories:\n - columns\n\n Examples:\n Combine columns in rows of a dataset:\n\n ```python\n from distilabel.steps import MergeColumns\n\n combiner = MergeColumns(\n columns=[\"queries\", \"multiple_queries\"],\n output_column=\"queries\",\n )\n combiner.load()\n\n result = next(\n combiner.process(\n [\n {\n \"queries\": \"How are you?\",\n \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n }\n ],\n )\n )\n # >>> result\n # [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n ```\n \"\"\"\n\n columns: List[str]\n output_column: Optional[str] = None\n\n @property\n def inputs(self) -> \"StepColumns\":\n return self.columns\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [self.output_column] if self.output_column else [\"merged_column\"]\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\":\n combined = []\n for input in inputs:\n combined.append(\n merge_columns(\n input,\n columns=self.columns,\n new_column=self.outputs[0],\n )\n )\n yield combined\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group","title":"group ","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns","title":"GroupColumns ","text":" Bases: Step Combines columns from a list of StepInput . GroupColumns is a Step that implements the process method that calls the group_dicts function to handle and combine a list of StepInput . Also GroupColumns provides two attributes columns and output_columns to specify the columns to group and the output columns which will override the default value for the properties inputs and outputs , respectively. Attributes: Name Type Description columns List[str] List of strings with the names of the columns to group. output_columns Optional[List[str]] Optional list of strings with the names of the output columns. 
Input columns - dynamic (determined by
columns attribute): The columns to group. Output columns - dynamic (determined by
columns and output_columns attributes): The columns that were grouped. Categories Examples: Group columns of a dataset:\n\n```python\nfrom distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n)\ngroup_columns.load()\n\nresult = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n)\n# >>> result\n# [{'merged_generation': ['AI generated text', 'Other generated text'], 'merged_model_name': ['my_model']}]\n```\n\nSpecify the name of the output columns:\n\n```python\nfrom distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"generation_models\"]\n)\ngroup_columns.load()\n\nresult = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n)\n# >>> result\n#[{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n```\n Source code in src/distilabel/steps/columns/group.py class GroupColumns(Step):\n \"\"\"Combines columns from a list of `StepInput`.\n\n `GroupColumns` is a `Step` that implements the `process` method that calls the `group_dicts`\n function to handle and combine a list of `StepInput`. Also `GroupColumns` provides two attributes\n `columns` and `output_columns` to specify the columns to group and the output columns\n which will override the default value for the properties `inputs` and `outputs`, respectively.\n\n Attributes:\n columns: List of strings with the names of the columns to group.\n output_columns: Optional list of strings with the names of the output columns.\n\n Input columns:\n - dynamic (determined by `columns` attribute): The columns to group.\n\n Output columns:\n - dynamic (determined by `columns` and `output_columns` attributes): The columns\n that were grouped.\n\n Categories:\n - columns\n\n Examples:\n\n Group columns of a dataset:\n\n ```python\n from distilabel.steps import GroupColumns\n\n group_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n )\n group_columns.load()\n\n result = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n )\n # >>> result\n # [{'merged_generation': ['AI generated text', 'Other generated text'], 'merged_model_name': ['my_model']}]\n ```\n\n Specify the name of the output columns:\n\n ```python\n from distilabel.steps import GroupColumns\n\n group_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"generation_models\"]\n )\n group_columns.load()\n\n result = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n )\n # >>> result\n #[{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n ```\n \"\"\"\n\n columns: List[str]\n output_columns: Optional[List[str]] = None\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task are the column names in 
`columns`.\"\"\"\n return self.columns\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs for the task are the column names in `output_columns` or\n `grouped_{column}` for each column in `columns`.\"\"\"\n return (\n self.output_columns\n if self.output_columns is not None\n else [f\"grouped_{column}\" for column in self.columns]\n )\n\n @override\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"The `process` method calls the `group_dicts` function to handle and combine a list of `StepInput`.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with the combined `StepInput` using the `group_dicts` function.\n \"\"\"\n yield group_columns(\n *inputs,\n group_columns=self.inputs,\n output_group_columns=self.outputs,\n )\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task are the column names in columns . "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.outputs","title":"outputs: StepColumns property ","text":"The outputs for the task are the column names in output_columns or grouped_{column} for each column in columns . "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.process","title":"process(*inputs) ","text":"The process method calls the group_dicts function to handle and combine a list of StepInput . Parameters: Name Type Description Default *inputs StepInput A list of StepInput to be combined. () Yields: Type Description StepOutput A StepOutput with the combined StepInput using the group_dicts function. Source code in src/distilabel/steps/columns/group.py @override\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"The `process` method calls the `group_dicts` function to handle and combine a list of `StepInput`.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with the combined `StepInput` using the `group_dicts` function.\n \"\"\"\n yield group_columns(\n *inputs,\n group_columns=self.inputs,\n output_group_columns=self.outputs,\n )\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.CombineColumns","title":"CombineColumns ","text":" Bases: GroupColumns CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead. Source code in src/distilabel/steps/columns/group.py class CombineColumns(GroupColumns):\n \"\"\"`CombineColumns` is deprecated and will be removed in version 1.5.0, use `GroupColumns` instead.\"\"\"\n\n def __init__(self, **data: Any) -> None:\n warnings.warn(\n \"`CombineColumns` is deprecated and will be removed in version 1.5.0, use `GroupColumns` instead.\",\n DeprecationWarning,\n stacklevel=2,\n )\n return super().__init__(**data)\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils","title":"utils ","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.merge_distilabel_metadata","title":"merge_distilabel_metadata(*output_dicts) ","text":"Merge the DISTILABEL_METADATA_KEY from multiple output dictionaries. Parameters: Name Type Description Default *output_dicts Dict[str, Any] Variable number of dictionaries containing distilabel metadata. () Returns: Type Description Dict[str, Any] A merged dictionary containing all the distilabel metadata from the input dictionaries. 
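A minimal usage sketch, assuming the metadata is stored under the `distilabel_metadata` key (the value of `DISTILABEL_METADATA_KEY`) and using illustrative input dictionaries: ```python
from distilabel.steps.columns.utils import merge_distilabel_metadata

# Two output rows, each carrying its own metadata (assumed key: "distilabel_metadata").
output_1 = {"generation": "text A", "distilabel_metadata": {"raw_output": "raw A"}}
output_2 = {"generation": "text B", "distilabel_metadata": {"raw_output": "raw B"}}

merged = merge_distilabel_metadata(output_1, output_2)
# Keys appearing in several inputs are collected into a list:
# {'raw_output': ['raw A', 'raw B']}
```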
Source code in src/distilabel/steps/columns/utils.py def merge_distilabel_metadata(*output_dicts: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"\n Merge the `DISTILABEL_METADATA_KEY` from multiple output dictionaries.\n\n Args:\n *output_dicts: Variable number of dictionaries containing distilabel metadata.\n\n Returns:\n A merged dictionary containing all the distilabel metadata from the input dictionaries.\n \"\"\"\n merged_metadata = defaultdict(list)\n\n for output_dict in output_dicts:\n metadata = output_dict.get(DISTILABEL_METADATA_KEY, {})\n for key, value in metadata.items():\n merged_metadata[key].append(value)\n\n final_metadata = {}\n for key, value_list in merged_metadata.items():\n if len(value_list) == 1:\n final_metadata[key] = value_list[0]\n else:\n final_metadata[key] = value_list\n\n return final_metadata\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.group_columns","title":"group_columns(*inputs, group_columns, output_group_columns=None) ","text":"Groups multiple list of dictionaries into a single list of dictionaries on the specified group_columns . If group_columns are provided, then it will also rename group_columns . Parameters: Name Type Description Default inputs StepInput list of dictionaries to combine. () group_columns List[str] list of keys to merge on. required output_group_columns Optional[List[str]] list of keys to rename the merge keys to. Defaults to None . None Returns: Type Description StepInput A list of dictionaries where the values of the group_columns are combined into a StepInput list and renamed to output_group_columns . Source code in src/distilabel/steps/columns/utils.py def group_columns(\n *inputs: \"StepInput\",\n group_columns: List[str],\n output_group_columns: Optional[List[str]] = None,\n) -> \"StepInput\":\n \"\"\"Groups multiple list of dictionaries into a single list of dictionaries on the\n specified `group_columns`. If `group_columns` are provided, then it will also rename\n `group_columns`.\n\n Args:\n inputs: list of dictionaries to combine.\n group_columns: list of keys to merge on.\n output_group_columns: list of keys to rename the merge keys to. 
Defaults to `None`.\n\n Returns:\n A list of dictionaries where the values of the `group_columns` are combined into a\n list and renamed to `output_group_columns`.\n \"\"\"\n if output_group_columns is not None and len(output_group_columns) != len(\n group_columns\n ):\n raise ValueError(\n \"The length of `output_group_columns` must be the same as the length of `group_columns`.\"\n )\n if output_group_columns is None:\n output_group_columns = [f\"grouped_{key}\" for key in group_columns]\n group_columns_dict = dict(zip(group_columns, output_group_columns))\n\n result = []\n # Use zip to iterate over lists based on their index\n for dicts_at_index in zip(*inputs):\n combined_dict = {}\n metadata_dicts = []\n # Iterate over dicts at the same index\n for d in dicts_at_index:\n # Extract metadata for merging\n if DISTILABEL_METADATA_KEY in d:\n metadata_dicts.append(\n {DISTILABEL_METADATA_KEY: d[DISTILABEL_METADATA_KEY]}\n )\n # Iterate over key-value pairs in each dict\n for key, value in d.items():\n if key == DISTILABEL_METADATA_KEY:\n continue\n # If the key is in the merge_keys, append the value to the existing list\n if key in group_columns_dict.keys():\n combined_dict.setdefault(group_columns_dict[key], []).append(value)\n # If the key is not in the merge_keys, create a new key-value pair\n else:\n combined_dict[key] = value\n\n if metadata_dicts:\n combined_dict[DISTILABEL_METADATA_KEY] = merge_distilabel_metadata(\n *metadata_dicts\n )\n\n result.append(combined_dict)\n return result\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.merge_columns","title":"merge_columns(row, columns, new_column='combined_key') ","text":"Merge columns in a dictionary into a single column on the specified new_column . Parameters: Name Type Description Default row Dict[str, Any] Dictionary corresponding to a row in a dataset. required columns List[str] List of keys to merge. required new_column str Name of the new key created. 'combined_key' Returns: Type Description Dict[str, Any] Dictionary with the new merged key. Source code in src/distilabel/steps/columns/utils.py def merge_columns(\n row: Dict[str, Any], columns: List[str], new_column: str = \"combined_key\"\n) -> Dict[str, Any]:\n \"\"\"Merge columns in a dictionary into a single column on the specified `new_column`.\n\n Args:\n row: Dictionary corresponding to a row in a dataset.\n columns: List of keys to merge.\n new_column: Name of the new key created.\n\n Returns:\n Dictionary with the new merged key.\n \"\"\"\n result = row.copy() # preserve the original dictionary\n combined = []\n for key in columns:\n to_combine = result.pop(key)\n if not isinstance(to_combine, list):\n to_combine = [to_combine]\n combined += to_combine\n result[new_column] = combined\n return result\n "},{"location":"api/step_gallery/extra/","title":"Extra","text":""},{"location":"api/step_gallery/extra/#distilabel.steps","title":"steps ","text":""},{"location":"api/step_gallery/extra/#distilabel.steps.DBSCAN","title":"DBSCAN ","text":" Bases: GlobalStep DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples in regions of high density and expands clusters from them. This algorithm is good for data which contains clusters of similar density. This is a GlobalStep that clusters the embeddings using the DBSCAN algorithm from sklearn . Visit TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub. Input columns - projection (
List[float] ): Vector representation of the text to cluster, normally the output from the UMAP step. Output columns - cluster_label (
int ): Integer representing the label of a given cluster. -1 means it wasn't clustered. Categories - clustering
- text-classification
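Besides the full `TextClustering` pipeline referenced for this step, a minimal sketch of wiring `DBSCAN` on its own could look as follows; `LoadDataFromDicts` and the toy `projection` values are illustrative assumptions, not part of the original example: ```python
from distilabel.pipeline import Pipeline
from distilabel.steps import DBSCAN, LoadDataFromDicts

with Pipeline(name="dbscan-sketch") as pipeline:
    loader = LoadDataFromDicts(
        data=[
            {"projection": [0.10, 0.20]},
            {"projection": [0.12, 0.19]},
            {"projection": [5.00, 5.10]},
        ]
    )
    # Small `eps`/`min_samples` so the toy points form one dense cluster plus noise.
    dbscan = DBSCAN(eps=0.5, min_samples=2, metric="euclidean")
    loader >> dbscan

# distiset = pipeline.run(use_cache=False)
# Each row would then carry a `cluster_label` column (-1 meaning it wasn't clustered).
```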
References DBSCAN demo of sklearn sklearn dbscan Attributes: Name Type Description - eps The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. - min_samples The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse. - metric The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter. - n_jobs The number of parallel jobs to run. Runtime parameters eps : The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. min_samples : The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse. metric : The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter. n_jobs : The number of parallel jobs to run. Source code in src/distilabel/steps/clustering/dbscan.py class DBSCAN(GlobalStep):\n r\"\"\"DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core\n samples in regions of high density and expands clusters from them. This algorithm\n is good for data which contains clusters of similar density.\n\n This is a `GlobalStep` that clusters the embeddings using the DBSCAN algorithm\n from `sklearn`. Visit `TextClustering` step for an example of use.\n The trained model is saved as an artifact when creating a distiset\n and pushing it to the Hugging Face Hub.\n\n Input columns:\n - projection (`List[float]`): Vector representation of the text to cluster,\n normally the output from the `UMAP` step.\n\n Output columns:\n - cluster_label (`int`): Integer representing the label of a given cluster. -1\n means it wasn't clustered.\n\n Categories:\n - clustering\n - text-classification\n\n References:\n - [`DBSCAN demo of sklearn`](https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#demo-of-dbscan-clustering-algorithm)\n - [`sklearn dbscan`](https://scikit-learn.org/stable/modules/clustering.html#dbscan)\n\n Attributes:\n - eps: The maximum distance between two samples for one to be considered as in the\n neighborhood of the other. This is not a maximum bound on the distances of\n points within a cluster. This is the most important DBSCAN parameter to\n choose appropriately for your data set and distance function.\n - min_samples: The number of samples (or total weight) in a neighborhood for a point\n to be considered as a core point. This includes the point itself. 
If `min_samples`\n is set to a higher value, DBSCAN will find denser clusters, whereas if it is set\n to a lower value, the found clusters will be more sparse.\n - metric: The metric to use when calculating distance between instances in a feature\n array. If metric is a string or callable, it must be one of the options allowed\n by `sklearn.metrics.pairwise_distances` for its metric parameter.\n - n_jobs: The number of parallel jobs to run.\n\n Runtime parameters:\n - `eps`: The maximum distance between two samples for one to be considered as in the\n neighborhood of the other. This is not a maximum bound on the distances of\n points within a cluster. This is the most important DBSCAN parameter to\n choose appropriately for your data set and distance function.\n - `min_samples`: The number of samples (or total weight) in a neighborhood for a point\n to be considered as a core point. This includes the point itself. If `min_samples`\n is set to a higher value, DBSCAN will find denser clusters, whereas if it is set\n to a lower value, the found clusters will be more sparse.\n - `metric`: The metric to use when calculating distance between instances in a feature\n array. If metric is a string or callable, it must be one of the options allowed\n by `sklearn.metrics.pairwise_distances` for its metric parameter.\n - `n_jobs`: The number of parallel jobs to run.\n \"\"\"\n\n eps: Optional[RuntimeParameter[float]] = Field(\n default=0.3,\n description=(\n \"The maximum distance between two samples for one to be considered \"\n \"as in the neighborhood of the other. This is not a maximum bound \"\n \"on the distances of points within a cluster. This is the most \"\n \"important DBSCAN parameter to choose appropriately for your data set \"\n \"and distance function.\"\n ),\n )\n min_samples: Optional[RuntimeParameter[int]] = Field(\n default=30,\n description=(\n \"The number of samples (or total weight) in a neighborhood for a point to \"\n \"be considered as a core point. This includes the point itself. If \"\n \"`min_samples` is set to a higher value, DBSCAN will find denser clusters, \"\n \"whereas if it is set to a lower value, the found clusters will be more \"\n \"sparse.\"\n ),\n )\n metric: Optional[RuntimeParameter[str]] = Field(\n default=\"euclidean\",\n description=(\n \"The metric to use when calculating distance between instances in a \"\n \"feature array. If metric is a string or callable, it must be one of \"\n \"the options allowed by `sklearn.metrics.pairwise_distances` for \"\n \"its metric parameter.\"\n ),\n )\n n_jobs: Optional[RuntimeParameter[int]] = Field(\n default=8, description=\"The number of parallel jobs to run.\"\n )\n\n _clusterer: Optional[\"_DBSCAN\"] = PrivateAttr(None)\n\n def load(self) -> None:\n super().load()\n if importlib.util.find_spec(\"sklearn\") is None:\n raise ImportError(\n \"`sklearn` package is not installed. 
Please install it using `pip install scikit-learn`.\"\n )\n from sklearn.cluster import DBSCAN as _DBSCAN\n\n self._clusterer = _DBSCAN(\n eps=self.eps,\n min_samples=self.min_samples,\n metric=self.metric,\n n_jobs=self.n_jobs,\n )\n\n def unload(self) -> None:\n self._clusterer = None\n\n @property\n def inputs(self) -> List[str]:\n return [\"projection\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"cluster_label\"]\n\n def _save_model(self, model: Any) -> None:\n import joblib\n\n def save_model(path):\n with open(str(path / \"DBSCAN.joblib\"), \"wb\") as f:\n joblib.dump(model, f)\n\n self.save_artifact(\n name=\"DBSCAN_model\",\n write_function=lambda path: save_model(path),\n metadata={\n \"eps\": self.eps,\n \"min_samples\": self.min_samples,\n \"metric\": self.metric,\n },\n )\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n projections = np.array([input[\"projection\"] for input in inputs])\n\n self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Start training DBSCAN...\")\n fitted_clusterer = self._clusterer.fit(projections)\n cluster_labels = fitted_clusterer.labels_\n # Sets the cluster labels for each input, -1 means it wasn't clustered\n for input, cluster_label in zip(inputs, cluster_labels):\n input[\"cluster_label\"] = cluster_label\n self._logger.info(f\"DBSCAN labels assigned: {len(set(cluster_labels))}\")\n self._save_model(fitted_clusterer)\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering","title":"TextClustering ","text":" Bases: TextClassification , GlobalTask Task that clusters a set of texts and generates summary labels for each cluster. This is a GlobalTask that inherits from TextClassification , this means that all the attributes from that class are available here. Also, in this case we deal with all the inputs at once, instead of using batches. The input_batch_size is used here to send the examples to the LLM in batches (a subtle difference with the more common Task definitions). The task looks in each cluster for a given number of representative examples (the number is set by the samples_per_cluster attribute), and sends them to the LLM to get a label/s that represent the cluster. The labels are then assigned to each text in the cluster. The clusters and projections used in the step, are assumed to be obtained from the UMAP + DBSCAN steps, but could be generated for similar steps, as long as they represent the same concepts. This step runs a pipeline like the one in this repository: https://github.com/huggingface/text-clustering Input columns - text (
str ): The reference text we want to obtain labels for. - projection (
List[float] ): Vector representation of the text to cluster, normally the output from the UMAP step. - cluster_label (
int ): Integer representing the label of a given cluster. -1 means it wasn't clustered. Output columns - summary_label (
str ): The label or list of labels for the text. - model_name (
str ): The name of the model used to generate the label/s. Categories - clustering
- text-classification
References text-clustering repository Attributes: Name Type Description - savefig Whether to generate and save a figure with the clustering of the texts. - samples_per_cluster The number of examples to use in the LLM as a sample of the cluster. Examples: Generate labels for a set of texts using clustering: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import UMAP, DBSCAN, TextClustering\nfrom distilabel.pipeline import Pipeline\n\nds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\nwith Pipeline(name=\"Text clustering dataset\") as pipeline:\n batch_size = 500\n\n ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n umap = UMAP(n_components=2, metric=\"cosine\")\n dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n text_clustering = TextClustering(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n n=3, # 3 labels per example\n query_title=\"Examples of Personas\",\n samples_per_cluster=10,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following types of personas. All the examples of personas must share \"\n \"the same set of labels.\"\n ),\n default_label=\"None\",\n savefig=True,\n input_batch_size=8,\n input_mappings={\"text\": \"persona\"},\n use_default_structured_output=True,\n )\n\n loader >> umap >> dbscan >> text_clustering\n Source code in src/distilabel/steps/clustering/text_clustering.py class TextClustering(TextClassification, GlobalTask):\n \"\"\"Task that clusters a set of texts and generates summary labels for each cluster.\n\n This is a `GlobalTask` that inherits from `TextClassification`, this means that all\n the attributes from that class are available here. Also, in this case we deal\n with all the inputs at once, instead of using batches. The `input_batch_size` is\n used here to send the examples to the LLM in batches (a subtle difference with the\n more common `Task` definitions).\n The task looks in each cluster for a given number of representative examples (the number\n is set by the `samples_per_cluster` attribute), and sends them to the LLM to get a label/s\n that represent the cluster. The labels are then assigned to each text in the cluster.\n The clusters and projections used in the step, are assumed to be obtained from the `UMAP`\n + `DBSCAN` steps, but could be generated for similar steps, as long as they represent the\n same concepts.\n This step runs a pipeline like the one in this repository:\n https://github.com/huggingface/text-clustering\n\n Input columns:\n - text (`str`): The reference text we want to obtain labels for.\n - projection (`List[float]`): Vector representation of the text to cluster,\n normally the output from the `UMAP` step.\n - cluster_label (`int`): Integer representing the label of a given cluster. 
-1\n means it wasn't clustered.\n\n Output columns:\n - summary_label (`str`): The label or list of labels for the text.\n - model_name (`str`): The name of the model used to generate the label/s.\n\n Categories:\n - clustering\n - text-classification\n\n References:\n - [`text-clustering repository`](https://github.com/huggingface/text-clustering)\n\n Attributes:\n - savefig: Whether to generate and save a figure with the clustering of the texts.\n - samples_per_cluster: The number of examples to use in the LLM as a sample of the cluster.\n\n Examples:\n Generate labels for a set of texts using clustering:\n\n ```python\n from distilabel.models import InferenceEndpointsLLM\n from distilabel.steps import UMAP, DBSCAN, TextClustering\n from distilabel.pipeline import Pipeline\n\n ds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\n with Pipeline(name=\"Text clustering dataset\") as pipeline:\n batch_size = 500\n\n ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n umap = UMAP(n_components=2, metric=\"cosine\")\n dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n text_clustering = TextClustering(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n n=3, # 3 labels per example\n query_title=\"Examples of Personas\",\n samples_per_cluster=10,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following types of personas. All the examples of personas must share \"\n \"the same set of labels.\"\n ),\n default_label=\"None\",\n savefig=True,\n input_batch_size=8,\n input_mappings={\"text\": \"persona\"},\n use_default_structured_output=True,\n )\n\n loader >> umap >> dbscan >> text_clustering\n ```\n \"\"\"\n\n savefig: Optional[RuntimeParameter[bool]] = Field(\n default=True,\n description=\"Whether to generate and save a figure with the clustering of the texts.\",\n )\n samples_per_cluster: int = Field(\n default=10,\n description=\"The number of examples to use in the LLM as a sample of the cluster.\",\n )\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task are the same as those for `TextClassification` plus\n the `projection` and `cluster_label` columns (which can be obtained from\n UMAP + DBSCAN steps).\n \"\"\"\n return super().inputs + [\"projection\", \"cluster_label\"]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `summary_label` and the `model_name`.\"\"\"\n return [\"summary_label\", \"model_name\"]\n\n def load(self) -> None:\n super().load()\n if self.savefig and (importlib.util.find_spec(\"matplotlib\") is None):\n raise ImportError(\n \"`matplotlib` package is not installed. 
Please install it using `pip install matplotlib`.\"\n )\n\n def _save_figure(\n self,\n data: pd.DataFrame,\n cluster_centers: Dict[str, Tuple[float, float]],\n cluster_summaries: Dict[int, str],\n ) -> None:\n \"\"\"Saves the figure starting from the dataframe, using matplotlib.\n\n Args:\n data: pd.DataFrame with the columns 'X', 'Y' and 'labels' representing\n the projections and the label of each text respectively.\n cluster_centers: Dictionary mapping from each label the center of a cluster,\n to help with the placement of the annotations.\n cluster_summaries: The summaries of the clusters, obtained from the LLM.\n \"\"\"\n import matplotlib.pyplot as plt\n\n fig, ax = plt.subplots(figsize=(12, 8), dpi=300)\n unique_labels = data[\"labels\"].unique()\n # Map of colors for each label (-1 is black)\n colormap = dict(\n zip(unique_labels, plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))))\n )\n colormap[-1] = np.array([0, 0, 0, 0])\n data[\"color\"] = data[\"labels\"].map(colormap)\n\n data.plot(\n kind=\"scatter\",\n x=\"X\",\n y=\"Y\",\n c=\"color\",\n s=0.75,\n alpha=0.8,\n linewidth=0.4,\n ax=ax,\n colorbar=False,\n )\n\n for label in cluster_summaries.keys():\n if label == -1:\n continue\n summary = str(cluster_summaries[label]) # These are obtained from the LLM\n position = cluster_centers[label]\n t = ax.text(\n position[0],\n position[1],\n summary,\n horizontalalignment=\"center\",\n verticalalignment=\"center\",\n fontsize=4,\n )\n t.set_bbox(\n {\n \"facecolor\": \"white\",\n \"alpha\": 0.9,\n \"linewidth\": 0,\n \"boxstyle\": \"square,pad=0.1\",\n }\n )\n\n ax.set_axis_off()\n # Save the plot as an artifact of the step\n self.save_artifact(\n name=\"Text clusters\",\n write_function=lambda path: fig.savefig(path / \"figure_clustering.png\"),\n metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n )\n plt.close()\n\n def _create_figure(\n self,\n inputs: StepInput,\n label2docs: Dict[int, List[str]],\n cluster_summaries: Dict[int, str],\n ) -> None:\n \"\"\"Creates a figure of the clustered texts and save it as an artifact.\n\n Args:\n inputs: The inputs of the step, as we will extract information from them again.\n label2docs: Map from each label to the list of documents (texts) that belong to that cluster.\n cluster_summaries: The summaries of the clusters, obtained from the LLM.\n \"\"\"\n self._logger.info(\"\ud83d\uddbc\ufe0f Creating figure for the clusters...\")\n\n labels = []\n projections = []\n id2cluster = {}\n for i, input in enumerate(inputs):\n label = input[\"cluster_label\"]\n id2cluster[i] = label\n labels.append(label)\n projections.append(input[\"projection\"])\n\n projections = np.array(projections)\n\n # Contains the placement of the cluster centers in the figure\n cluster_centers: Dict[str, Tuple[float, float]] = {}\n for label in label2docs.keys():\n x = np.mean([projections[doc, 0] for doc in label2docs[label]])\n y = np.mean([projections[doc, 1] for doc in label2docs[label]])\n cluster_centers[label] = (x, y)\n\n df = pd.DataFrame(\n data={\n \"X\": projections[:, 0],\n \"Y\": projections[:, 1],\n \"labels\": labels,\n }\n )\n\n self._save_figure(\n df, cluster_centers=cluster_centers, cluster_summaries=cluster_summaries\n )\n\n def _prepare_input_texts(\n self,\n inputs: StepInput,\n label2docs: Dict[int, List[int]],\n unique_labels: List[int],\n ) -> List[Dict[str, Union[str, int]]]:\n \"\"\"Prepares a batch of inputs to send to the LLM, with the examples of each cluster.\n\n Args:\n inputs: Inputs from the step.\n label2docs: Map from 
each label to the list of documents (texts) that\n belong to that cluster.\n unique_labels: The unique labels of the clusters.\n\n Returns:\n The input texts to send to the LLM, with the examples of each cluster\n prepared to be used in the prompt, and an additional key to store the\n labels (that will be needed to find the data after the batches are\n returned from the LLM).\n \"\"\"\n input_texts = []\n for label in range(unique_labels): # The label -1 is implicitly excluded\n # Get the ids but remove possible duplicates, which could happen with bigger probability\n # the bigger the number of examples requested, and the smaller the subset of examples\n ids = set(\n np.random.choice(label2docs[label], size=self.samples_per_cluster)\n ) # Grab the number of examples\n examples = [inputs[i][\"text\"] for i in ids]\n input_text = {\n \"text\": \"\\n\\n\".join(\n [f\"Example {i}:\\n{t}\" for i, t in enumerate(examples, start=1)]\n ),\n \"__LABEL\": label,\n }\n input_texts.append(input_text)\n return input_texts\n\n def process(self, inputs: StepInput) -> \"StepOutput\":\n labels = [input[\"cluster_label\"] for input in inputs]\n # -1 because -1 is the label for the unclassified\n unique_labels = len(set(labels)) - 1\n # This will be the output of the LLM, the set of labels for each cluster\n cluster_summaries: Dict[int, str] = {-1: self.default_label}\n\n # Map from label to list of documents, will use them to select examples from each cluster\n label2docs = defaultdict(list)\n for i, label in enumerate(labels):\n label2docs[label].append(i)\n\n input_texts = self._prepare_input_texts(inputs, label2docs, unique_labels)\n\n # Send the texts in batches to the LLM, and get the labels for each cluster\n for i, batched_inputs in enumerate(batched(input_texts, self.input_batch_size)):\n self._logger.info(f\"\ud83d\udce6 Processing internal batch of inputs {i}...\")\n results = super().process(batched_inputs)\n for result in next(results): # Extract the elements from the generator\n cluster_summaries[result[\"__LABEL\"]] = result[\"labels\"]\n\n # Assign the labels to each text\n for input in inputs:\n input[\"summary_label\"] = json.dumps(\n cluster_summaries[input[\"cluster_label\"]]\n )\n\n if self.savefig:\n self._create_figure(inputs, label2docs, cluster_summaries)\n\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering.inputs","title":"inputs: List[str] property ","text":"The input for the task are the same as those for TextClassification plus the projection and cluster_label columns (which can be obtained from UMAP + DBSCAN steps). "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering.outputs","title":"outputs: List[str] property ","text":"The output for the task is the summary_label and the model_name . "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._save_figure","title":"_save_figure(data, cluster_centers, cluster_summaries) ","text":"Saves the figure starting from the dataframe, using matplotlib. Parameters: Name Type Description Default data DataFrame pd.DataFrame with the columns 'X', 'Y' and 'labels' representing the projections and the label of each text respectively. required cluster_centers Dict[str, Tuple[float, float]] Dictionary mapping from each label the center of a cluster, to help with the placement of the annotations. required cluster_summaries Dict[int, str] The summaries of the clusters, obtained from the LLM. 
required Source code in src/distilabel/steps/clustering/text_clustering.py def _save_figure(\n self,\n data: pd.DataFrame,\n cluster_centers: Dict[str, Tuple[float, float]],\n cluster_summaries: Dict[int, str],\n) -> None:\n \"\"\"Saves the figure starting from the dataframe, using matplotlib.\n\n Args:\n data: pd.DataFrame with the columns 'X', 'Y' and 'labels' representing\n the projections and the label of each text respectively.\n cluster_centers: Dictionary mapping from each label the center of a cluster,\n to help with the placement of the annotations.\n cluster_summaries: The summaries of the clusters, obtained from the LLM.\n \"\"\"\n import matplotlib.pyplot as plt\n\n fig, ax = plt.subplots(figsize=(12, 8), dpi=300)\n unique_labels = data[\"labels\"].unique()\n # Map of colors for each label (-1 is black)\n colormap = dict(\n zip(unique_labels, plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))))\n )\n colormap[-1] = np.array([0, 0, 0, 0])\n data[\"color\"] = data[\"labels\"].map(colormap)\n\n data.plot(\n kind=\"scatter\",\n x=\"X\",\n y=\"Y\",\n c=\"color\",\n s=0.75,\n alpha=0.8,\n linewidth=0.4,\n ax=ax,\n colorbar=False,\n )\n\n for label in cluster_summaries.keys():\n if label == -1:\n continue\n summary = str(cluster_summaries[label]) # These are obtained from the LLM\n position = cluster_centers[label]\n t = ax.text(\n position[0],\n position[1],\n summary,\n horizontalalignment=\"center\",\n verticalalignment=\"center\",\n fontsize=4,\n )\n t.set_bbox(\n {\n \"facecolor\": \"white\",\n \"alpha\": 0.9,\n \"linewidth\": 0,\n \"boxstyle\": \"square,pad=0.1\",\n }\n )\n\n ax.set_axis_off()\n # Save the plot as an artifact of the step\n self.save_artifact(\n name=\"Text clusters\",\n write_function=lambda path: fig.savefig(path / \"figure_clustering.png\"),\n metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n )\n plt.close()\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._create_figure","title":"_create_figure(inputs, label2docs, cluster_summaries) ","text":"Creates a figure of the clustered texts and save it as an artifact. Parameters: Name Type Description Default inputs StepInput The inputs of the step, as we will extract information from them again. required label2docs Dict[int, List[str]] Map from each label to the list of documents (texts) that belong to that cluster. required cluster_summaries Dict[int, str] The summaries of the clusters, obtained from the LLM. 
required Source code in src/distilabel/steps/clustering/text_clustering.py def _create_figure(\n self,\n inputs: StepInput,\n label2docs: Dict[int, List[str]],\n cluster_summaries: Dict[int, str],\n) -> None:\n \"\"\"Creates a figure of the clustered texts and save it as an artifact.\n\n Args:\n inputs: The inputs of the step, as we will extract information from them again.\n label2docs: Map from each label to the list of documents (texts) that belong to that cluster.\n cluster_summaries: The summaries of the clusters, obtained from the LLM.\n \"\"\"\n self._logger.info(\"\ud83d\uddbc\ufe0f Creating figure for the clusters...\")\n\n labels = []\n projections = []\n id2cluster = {}\n for i, input in enumerate(inputs):\n label = input[\"cluster_label\"]\n id2cluster[i] = label\n labels.append(label)\n projections.append(input[\"projection\"])\n\n projections = np.array(projections)\n\n # Contains the placement of the cluster centers in the figure\n cluster_centers: Dict[str, Tuple[float, float]] = {}\n for label in label2docs.keys():\n x = np.mean([projections[doc, 0] for doc in label2docs[label]])\n y = np.mean([projections[doc, 1] for doc in label2docs[label]])\n cluster_centers[label] = (x, y)\n\n df = pd.DataFrame(\n data={\n \"X\": projections[:, 0],\n \"Y\": projections[:, 1],\n \"labels\": labels,\n }\n )\n\n self._save_figure(\n df, cluster_centers=cluster_centers, cluster_summaries=cluster_summaries\n )\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._prepare_input_texts","title":"_prepare_input_texts(inputs, label2docs, unique_labels) ","text":"Prepares a batch of inputs to send to the LLM, with the examples of each cluster. Parameters: Name Type Description Default inputs StepInput Inputs from the step. required label2docs Dict[int, List[int]] Map from each label to the list of documents (texts) that belong to that cluster. required unique_labels List[int] The unique labels of the clusters. required Returns: Type Description List[Dict[str, Union[str, int]]] The input texts to send to the LLM, with the examples of each cluster List[Dict[str, Union[str, int]]] prepared to be used in the prompt, and an additional key to store the List[Dict[str, Union[str, int]]] labels (that will be needed to find the data after the batches are List[Dict[str, Union[str, int]]] returned from the LLM). 
Source code in src/distilabel/steps/clustering/text_clustering.py def _prepare_input_texts(\n self,\n inputs: StepInput,\n label2docs: Dict[int, List[int]],\n unique_labels: List[int],\n) -> List[Dict[str, Union[str, int]]]:\n \"\"\"Prepares a batch of inputs to send to the LLM, with the examples of each cluster.\n\n Args:\n inputs: Inputs from the step.\n label2docs: Map from each label to the list of documents (texts) that\n belong to that cluster.\n unique_labels: The unique labels of the clusters.\n\n Returns:\n The input texts to send to the LLM, with the examples of each cluster\n prepared to be used in the prompt, and an additional key to store the\n labels (that will be needed to find the data after the batches are\n returned from the LLM).\n \"\"\"\n input_texts = []\n for label in range(unique_labels): # The label -1 is implicitly excluded\n # Get the ids but remove possible duplicates, which could happen with bigger probability\n # the bigger the number of examples requested, and the smaller the subset of examples\n ids = set(\n np.random.choice(label2docs[label], size=self.samples_per_cluster)\n ) # Grab the number of examples\n examples = [inputs[i][\"text\"] for i in ids]\n input_text = {\n \"text\": \"\\n\\n\".join(\n [f\"Example {i}:\\n{t}\" for i, t in enumerate(examples, start=1)]\n ),\n \"__LABEL\": label,\n }\n input_texts.append(input_text)\n return input_texts\n "},{"location":"api/step_gallery/extra/#distilabel.steps.UMAP","title":"UMAP ","text":" Bases: GlobalStep UMAP is a general purpose manifold learning and dimension reduction algorithm. This is a GlobalStep that reduces the dimensionality of the embeddings using. Visit the TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub. Input columns - embedding (
List[float] ): The original embeddings whose dimensionality we want to reduce. Output columns - projection (
List[float] ): Embedding reduced to the number of components specified; the size of the new embeddings will be determined by n_components . Categories - clustering
- text-classification
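Analogously to the DBSCAN sketch above, `UMAP` can be wired on its own; `LoadDataFromDicts` and the toy `embedding` values are illustrative assumptions, and a realistic run needs considerably more rows than shown here: ```python
from distilabel.pipeline import Pipeline
from distilabel.steps import UMAP, LoadDataFromDicts

with Pipeline(name="umap-sketch") as pipeline:
    loader = LoadDataFromDicts(
        data=[
            {"embedding": [0.10, 0.20, 0.30, 0.40]},
            {"embedding": [0.20, 0.10, 0.40, 0.30]},
            # ... many more rows in practice, since UMAP works on neighborhoods of points
        ]
    )
    umap = UMAP(n_components=2, metric="euclidean", random_state=42)
    loader >> umap

# distiset = pipeline.run(use_cache=False)
# Each row would then contain a `projection` column of length `n_components`.
```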
References UMAP repository UMAP documentation Attributes: Name Type Description - n_components The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100. - metric The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean . - n_jobs The number of parallel jobs to run. Defaults to 8 . - random_state The random state to use for the UMAP algorithm. Runtime parameters n_components : The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100. metric : The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean . n_jobs : The number of parallel jobs to run. Defaults to 8 . random_state : The random state to use for the UMAP algorithm. Citations @misc{mcinnes2020umapuniformmanifoldapproximation,\n title={UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction},\n author={Leland McInnes and John Healy and James Melville},\n year={2020},\n eprint={1802.03426},\n archivePrefix={arXiv},\n primaryClass={stat.ML},\n url={https://arxiv.org/abs/1802.03426},\n}\n Source code in src/distilabel/steps/clustering/umap.py class UMAP(GlobalStep):\n r\"\"\"UMAP is a general purpose manifold learning and dimension reduction algorithm.\n\n This is a `GlobalStep` that reduces the dimensionality of the embeddings using. Visit\n the `TextClustering` step for an example of use. The trained model is saved as an artifact\n when creating a distiset and pushing it to the Hugging Face Hub.\n\n Input columns:\n - embedding (`List[float]`): The original embeddings we want to reduce the dimension.\n\n Output columns:\n - projection (`List[float]`): Embedding reduced to the number of components specified,\n the size of the new embeddings will be determined by the `n_components`.\n\n Categories:\n - clustering\n - text-classification\n\n References:\n - [`UMAP repository`](https://github.com/lmcinnes/umap/tree/master)\n - [`UMAP documentation`](https://umap-learn.readthedocs.io/en/latest/)\n\n Attributes:\n - n_components: The dimension of the space to embed into. This defaults to 2 to\n provide easy visualization (that's probably what you want), but can\n reasonably be set to any integer value in the range 2 to 100.\n - metric: The metric to use to compute distances in high dimensional space.\n Visit UMAP's documentation for more information. Defaults to `euclidean`.\n - n_jobs: The number of parallel jobs to run. Defaults to `8`.\n - random_state: The random state to use for the UMAP algorithm.\n\n Runtime parameters:\n - `n_components`: The dimension of the space to embed into. This defaults to 2 to\n provide easy visualization (that's probably what you want), but can\n reasonably be set to any integer value in the range 2 to 100.\n - `metric`: The metric to use to compute distances in high dimensional space.\n Visit UMAP's documentation for more information. Defaults to `euclidean`.\n - `n_jobs`: The number of parallel jobs to run. 
Defaults to `8`.\n - `random_state`: The random state to use for the UMAP algorithm.\n\n Citations:\n ```\n @misc{mcinnes2020umapuniformmanifoldapproximation,\n title={UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction},\n author={Leland McInnes and John Healy and James Melville},\n year={2020},\n eprint={1802.03426},\n archivePrefix={arXiv},\n primaryClass={stat.ML},\n url={https://arxiv.org/abs/1802.03426},\n }\n ```\n \"\"\"\n\n n_components: Optional[RuntimeParameter[int]] = Field(\n default=2,\n description=(\n \"The dimension of the space to embed into. This defaults to 2 to \"\n \"provide easy visualization, but can reasonably be set to any \"\n \"integer value in the range 2 to 100.\"\n ),\n )\n metric: Optional[RuntimeParameter[str]] = Field(\n default=\"euclidean\",\n description=(\n \"The metric to use to compute distances in high dimensional space. \"\n \"Visit UMAP's documentation for more information.\"\n ),\n )\n n_jobs: Optional[RuntimeParameter[int]] = Field(\n default=8, description=\"The number of parallel jobs to run.\"\n )\n random_state: Optional[RuntimeParameter[int]] = Field(\n default=None, description=\"The random state to use for the UMAP algorithm.\"\n )\n\n _umap: Optional[\"_UMAP\"] = PrivateAttr(None)\n\n def load(self) -> None:\n super().load()\n if importlib.util.find_spec(\"umap\") is None:\n raise ImportError(\n \"`umap` package is not installed. Please install it using `pip install umap-learn`.\"\n )\n from umap import UMAP as _UMAP\n\n self._umap = _UMAP(\n n_components=self.n_components,\n metric=self.metric,\n n_jobs=self.n_jobs,\n random_state=self.random_state,\n )\n\n def unload(self) -> None:\n self._umap = None\n\n @property\n def inputs(self) -> List[str]:\n return [\"embedding\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"projection\"]\n\n def _save_model(self, model: Any) -> None:\n import joblib\n\n def save_model(path):\n with open(str(path / \"UMAP.joblib\"), \"wb\") as f:\n joblib.dump(model, f)\n\n self.save_artifact(\n name=\"UMAP_model\",\n write_function=lambda path: save_model(path),\n metadata={\n \"n_components\": self.n_components,\n \"metric\": self.metric,\n },\n )\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n # Shape of the embeddings is (n_samples, n_features)\n embeddings = np.array([input[\"embedding\"] for input in inputs])\n\n self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Start UMAP training...\")\n mapper = self._umap.fit(embeddings)\n # Shape of the projection will be (n_samples, n_components)\n for input, projection in zip(inputs, mapper.embedding_):\n input[\"projection\"] = projection\n\n self._save_model(mapper)\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.CombineOutputs","title":"CombineOutputs ","text":" Bases: Step Combine the outputs of several upstream steps. CombineOutputs is a Step that takes the outputs of several upstream steps and combines them to generate a new dictionary with all keys/columns of the upstream steps outputs. Input columns - dynamic (based on the upstream
Step s): All the columns of the upstream steps outputs. Output columns - dynamic (based on the upstream
Step s): All the columns of the upstream steps outputs. Categories Examples: Combine dictionaries of a dataset:\n\n```python\nfrom distilabel.steps import CombineOutputs\n\ncombine_outputs = CombineOutputs()\ncombine_outputs.load()\n\nresult = next(\n combine_outputs.process(\n [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n )\n)\n# [\n# {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n# {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n# ]\n```\n\nCombine upstream steps outputs in a pipeline:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs\n\nwith Pipeline() as pipeline:\n step_1 = ...\n step_2 = ...\n step_3 = ...\n combine = CombineOutputs()\n\n [step_1, step_2, step_3] >> combine\n```\n Source code in src/distilabel/steps/columns/combine.py class CombineOutputs(Step):\n \"\"\"Combine the outputs of several upstream steps.\n\n `CombineOutputs` is a `Step` that takes the outputs of several upstream steps and combines\n them to generate a new dictionary with all keys/columns of the upstream steps outputs.\n\n Input columns:\n - dynamic (based on the upstream `Step`s): All the columns of the upstream steps outputs.\n\n Output columns:\n - dynamic (based on the upstream `Step`s): All the columns of the upstream steps outputs.\n\n Categories:\n - columns\n\n Examples:\n\n Combine dictionaries of a dataset:\n\n ```python\n from distilabel.steps import CombineOutputs\n\n combine_outputs = CombineOutputs()\n combine_outputs.load()\n\n result = next(\n combine_outputs.process(\n [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n )\n )\n # [\n # {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n # {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n # ]\n ```\n\n Combine upstream steps outputs in a pipeline:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps import CombineOutputs\n\n with Pipeline() as pipeline:\n step_1 = ...\n step_2 = ...\n step_3 = ...\n combine = CombineOutputs()\n\n [step_1, step_2, step_3] >> combine\n ```\n \"\"\"\n\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n combined_outputs = []\n for output_dicts in zip(*inputs):\n combined_dict = {}\n for output_dict in output_dicts:\n combined_dict.update(\n {\n k: v\n for k, v in output_dict.items()\n if k != DISTILABEL_METADATA_KEY\n }\n )\n\n if any(\n DISTILABEL_METADATA_KEY in output_dict for output_dict in output_dicts\n ):\n combined_dict[DISTILABEL_METADATA_KEY] = merge_distilabel_metadata(\n *output_dicts\n )\n combined_outputs.append(combined_dict)\n\n yield combined_outputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering","title":"DeitaFiltering ","text":" Bases: GlobalStep Filter dataset rows using DEITA filtering strategy. Filter the dataset based on the DEITA score and the cosine distance between the embeddings. It's an implementation of the filtering step from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. Attributes: Name Type Description data_budget RuntimeParameter[int] The desired size of the dataset after filtering. diversity_threshold RuntimeParameter[float] If a row has a cosine distance with respect to it's nearest neighbor greater than this value, it will be included in the filtered dataset. Defaults to 0.9 . normalize_embeddings RuntimeParameter[bool] Whether to normalize the embeddings before computing the cosine distance. Defaults to True . 
Runtime parameters data_budget : The desired size of the dataset after filtering. diversity_threshold : If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset. Input columns - evol_instruction_score (
float ): The score of the instruction generated by ComplexityScorer step. - evol_response_score (
float ): The score of the response generated by QualityScorer step. - embedding (
List[float] ): The embedding generated for the conversation of the instruction-response pair using GenerateEmbeddings step. Output columns - deita_score (
float ): The DEITA score for the instruction-response pair. - deita_score_computed_with (
List[str] ): The scores used to compute the DEITA score. - nearest_neighbor_distance (
float ): The cosine distance between the embeddings of the instruction-response pair. Categories References What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning Examples: Filter the dataset based on the DEITA score and the cosine distance between the embeddings: from distilabel.steps import DeitaFiltering\n\ndeita_filtering = DeitaFiltering(data_budget=1)\n\ndeita_filtering.load()\n\nresult = next(\n deita_filtering.process(\n [\n {\n \"evol_instruction_score\": 0.5,\n \"evol_response_score\": 0.5,\n \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n },\n {\n \"evol_instruction_score\": 0.6,\n \"evol_response_score\": 0.6,\n \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n },\n {\n \"evol_instruction_score\": 0.7,\n \"evol_response_score\": 0.7,\n \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n },\n ],\n )\n)\n# >>> result\n# [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n Source code in src/distilabel/steps/deita.py class DeitaFiltering(GlobalStep):\n \"\"\"Filter dataset rows using DEITA filtering strategy.\n\n Filter the dataset based on the DEITA score and the cosine distance between the embeddings.\n It's an implementation of the filtering step from the paper 'What Makes Good Data\n for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.\n\n Attributes:\n data_budget: The desired size of the dataset after filtering.\n diversity_threshold: If a row has a cosine distance with respect to it's nearest\n neighbor greater than this value, it will be included in the filtered dataset.\n Defaults to `0.9`.\n normalize_embeddings: Whether to normalize the embeddings before computing the cosine\n distance. Defaults to `True`.\n\n Runtime parameters:\n - `data_budget`: The desired size of the dataset after filtering.\n - `diversity_threshold`: If a row has a cosine distance with respect to it's nearest\n neighbor greater than this value, it will be included in the filtered dataset.\n\n Input columns:\n - evol_instruction_score (`float`): The score of the instruction generated by\n `ComplexityScorer` step.\n - evol_response_score (`float`): The score of the response generated by\n `QualityScorer` step.\n - embedding (`List[float]`): The embedding generated for the conversation of the\n instruction-response pair using `GenerateEmbeddings` step.\n\n Output columns:\n - deita_score (`float`): The DEITA score for the instruction-response pair.\n - deita_score_computed_with (`List[str]`): The scores used to compute the DEITA\n score.\n - nearest_neighbor_distance (`float`): The cosine distance between the embeddings\n of the instruction-response pair.\n\n Categories:\n - filtering\n\n References:\n - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n Examples:\n Filter the dataset based on the DEITA score and the cosine distance between the embeddings:\n\n ```python\n from distilabel.steps import DeitaFiltering\n\n deita_filtering = DeitaFiltering(data_budget=1)\n\n deita_filtering.load()\n\n result = next(\n deita_filtering.process(\n [\n {\n \"evol_instruction_score\": 0.5,\n \"evol_response_score\": 0.5,\n \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n },\n {\n \"evol_instruction_score\": 0.6,\n \"evol_response_score\": 0.6,\n \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n },\n {\n \"evol_instruction_score\": 0.7,\n \"evol_response_score\": 0.7,\n \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n },\n ],\n )\n )\n # >>> result\n # [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n \"\"\"\n\n data_budget: RuntimeParameter[int] = Field(\n default=None, description=\"The desired size of the dataset after filtering.\"\n )\n diversity_threshold: RuntimeParameter[float] = Field(\n default=0.9,\n description=\"If a row has a cosine distance with respect to it's nearest neighbor\"\n \" greater than this value, it will be included in the filtered dataset.\",\n )\n normalize_embeddings: RuntimeParameter[bool] = Field(\n default=True,\n description=\"Whether to normalize the embeddings before computing the cosine distance.\",\n )\n distance_metric: RuntimeParameter[Literal[\"cosine\", \"manhattan\"]] = Field(\n default=\"cosine\",\n description=\"The distance metric to use. Currently only 'cosine' is supported.\",\n )\n\n @property\n def inputs(self) -> List[str]:\n return [\"evol_instruction_score\", \"evol_response_score\", \"embedding\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"deita_score\", \"nearest_neighbor_distance\", \"deita_score_computed_with\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Filter the dataset based on the DEITA score and the cosine distance between the\n embeddings.\n\n Args:\n inputs: The input data.\n\n Returns:\n The filtered dataset.\n \"\"\"\n inputs = self._compute_deita_score(inputs)\n inputs = self._compute_nearest_neighbor(inputs)\n inputs.sort(key=lambda x: x[\"deita_score\"], reverse=True)\n\n selected_rows = []\n for input in inputs:\n if len(selected_rows) >= self.data_budget: # type: ignore\n break\n if input[\"nearest_neighbor_distance\"] >= self.diversity_threshold:\n selected_rows.append(input)\n yield selected_rows\n\n def _compute_deita_score(self, inputs: StepInput) -> StepInput:\n \"\"\"Computes the DEITA score for each instruction-response pair. 
The DEITA score is\n the product of the instruction score and the response score.\n\n Args:\n inputs: The input data.\n\n Returns:\n The input data with the DEITA score computed.\n \"\"\"\n for input_ in inputs:\n evol_instruction_score = input_.get(\"evol_instruction_score\")\n evol_response_score = input_.get(\"evol_response_score\")\n\n if evol_instruction_score and evol_response_score:\n deita_score = evol_instruction_score * evol_response_score\n score_computed_with = [\"evol_instruction_score\", \"evol_response_score\"]\n elif evol_instruction_score:\n self._logger.warning(\n \"Response score is missing for the instruction-response pair. Using\"\n \" instruction score as DEITA score.\"\n )\n deita_score = evol_instruction_score\n score_computed_with = [\"evol_instruction_score\"]\n elif evol_response_score:\n self._logger.warning(\n \"Instruction score is missing for the instruction-response pair. Using\"\n \" response score as DEITA score.\"\n )\n deita_score = evol_response_score\n score_computed_with = [\"evol_response_score\"]\n else:\n self._logger.warning(\n \"Instruction and response scores are missing for the instruction-response\"\n \" pair. Setting DEITA score to 0.\"\n )\n deita_score = 0\n score_computed_with = []\n\n input_.update(\n {\n \"deita_score\": deita_score,\n \"deita_score_computed_with\": score_computed_with,\n }\n )\n return inputs\n\n def _compute_nearest_neighbor(self, inputs: StepInput) -> StepInput:\n \"\"\"Computes the cosine distance between the embeddings of the instruction-response\n pairs and the nearest neighbor.\n\n Args:\n inputs: The input data.\n\n Returns:\n The input data with the cosine distance computed.\n \"\"\"\n embeddings = np.array([input[\"embedding\"] for input in inputs])\n if self.normalize_embeddings:\n embeddings = self._normalize_embeddings(embeddings)\n self._logger.info(\"\ud83d\udccf Computing nearest neighbor distance...\")\n\n if self.distance_metric == \"cosine\":\n self._logger.info(\"\ud83d\udccf Using cosine distance.\")\n distances = self._cosine_distance(embeddings)\n else:\n self._logger.info(\"\ud83d\udccf Using manhattan distance.\")\n distances = self._manhattan_distance(embeddings)\n\n for distance, input in zip(distances, inputs):\n input[\"nearest_neighbor_distance\"] = distance\n return inputs\n\n def _normalize_embeddings(self, embeddings: np.ndarray) -> np.ndarray:\n \"\"\"Normalize the embeddings.\n\n Args:\n embeddings: The embeddings to normalize.\n\n Returns:\n The normalized embeddings.\n \"\"\"\n self._logger.info(\"\u2696\ufe0f Normalizing embeddings...\")\n norms = np.linalg.norm(embeddings, axis=1, keepdims=True)\n return embeddings / norms\n\n def _cosine_distance(self, embeddings: np.array) -> np.array: # type: ignore\n \"\"\"Computes the cosine distance between the embeddings.\n\n Args:\n embeddings: The embeddings.\n\n Returns:\n The cosine distance between the embeddings.\n \"\"\"\n cosine_similarity = np.dot(embeddings, embeddings.T)\n cosine_distance = 1 - cosine_similarity\n # Ignore self-distance\n np.fill_diagonal(cosine_distance, np.inf)\n return np.min(cosine_distance, axis=1)\n\n def _manhattan_distance(self, embeddings: np.array) -> np.array: # type: ignore\n \"\"\"Computes the manhattan distance between the embeddings.\n\n Args:\n embeddings: The embeddings.\n\n Returns:\n The manhattan distance between the embeddings.\n \"\"\"\n manhattan_distance = np.abs(embeddings[:, None] - embeddings).sum(-1)\n # Ignore self-distance\n np.fill_diagonal(manhattan_distance, np.inf)\n return 
np.min(manhattan_distance, axis=1)\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering.process","title":"process(inputs) ","text":"Filter the dataset based on the DEITA score and the cosine distance between the embeddings. Parameters: Name Type Description Default inputs StepInput The input data. required Returns: Type Description StepOutput The filtered dataset. Source code in src/distilabel/steps/deita.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Filter the dataset based on the DEITA score and the cosine distance between the\n embeddings.\n\n Args:\n inputs: The input data.\n\n Returns:\n The filtered dataset.\n \"\"\"\n inputs = self._compute_deita_score(inputs)\n inputs = self._compute_nearest_neighbor(inputs)\n inputs.sort(key=lambda x: x[\"deita_score\"], reverse=True)\n\n selected_rows = []\n for input in inputs:\n if len(selected_rows) >= self.data_budget: # type: ignore\n break\n if input[\"nearest_neighbor_distance\"] >= self.diversity_threshold:\n selected_rows.append(input)\n yield selected_rows\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._compute_deita_score","title":"_compute_deita_score(inputs) ","text":"Computes the DEITA score for each instruction-response pair. The DEITA score is the product of the instruction score and the response score. Parameters: Name Type Description Default inputs StepInput The input data. required Returns: Type Description StepInput The input data with the DEITA score computed. Source code in src/distilabel/steps/deita.py def _compute_deita_score(self, inputs: StepInput) -> StepInput:\n \"\"\"Computes the DEITA score for each instruction-response pair. The DEITA score is\n the product of the instruction score and the response score.\n\n Args:\n inputs: The input data.\n\n Returns:\n The input data with the DEITA score computed.\n \"\"\"\n for input_ in inputs:\n evol_instruction_score = input_.get(\"evol_instruction_score\")\n evol_response_score = input_.get(\"evol_response_score\")\n\n if evol_instruction_score and evol_response_score:\n deita_score = evol_instruction_score * evol_response_score\n score_computed_with = [\"evol_instruction_score\", \"evol_response_score\"]\n elif evol_instruction_score:\n self._logger.warning(\n \"Response score is missing for the instruction-response pair. Using\"\n \" instruction score as DEITA score.\"\n )\n deita_score = evol_instruction_score\n score_computed_with = [\"evol_instruction_score\"]\n elif evol_response_score:\n self._logger.warning(\n \"Instruction score is missing for the instruction-response pair. Using\"\n \" response score as DEITA score.\"\n )\n deita_score = evol_response_score\n score_computed_with = [\"evol_response_score\"]\n else:\n self._logger.warning(\n \"Instruction and response scores are missing for the instruction-response\"\n \" pair. Setting DEITA score to 0.\"\n )\n deita_score = 0\n score_computed_with = []\n\n input_.update(\n {\n \"deita_score\": deita_score,\n \"deita_score_computed_with\": score_computed_with,\n }\n )\n return inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._compute_nearest_neighbor","title":"_compute_nearest_neighbor(inputs) ","text":"Computes the cosine distance between the embeddings of the instruction-response pairs and the nearest neighbor. Parameters: Name Type Description Default inputs StepInput The input data. required Returns: Type Description StepInput The input data with the cosine distance computed. 
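Before the source listing that follows, here is a minimal numeric sketch (with made-up, already-normalized embeddings) of how the pairwise cosine distances and the per-row nearest-neighbour minimum are obtained; it mirrors the logic described above but is not the step's own code:

```python
import numpy as np

# Hypothetical, unit-norm embeddings (three rows, two dimensions).
embeddings = np.array([
    [1.0, 0.0],
    [0.0, 1.0],
    [0.8, 0.6],
])

cosine_similarity = embeddings @ embeddings.T   # dot products of normalized vectors
cosine_distance = 1 - cosine_similarity
np.fill_diagonal(cosine_distance, np.inf)       # ignore the distance of a row to itself
nearest_neighbor_distance = cosine_distance.min(axis=1)
print(nearest_neighbor_distance)                # approximately [0.2 0.4 0.2]
```

Rows whose nearest-neighbour distance is greater than or equal to `diversity_threshold` are the ones kept by the filtering step.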
Source code in src/distilabel/steps/deita.py def _compute_nearest_neighbor(self, inputs: StepInput) -> StepInput:\n \"\"\"Computes the cosine distance between the embeddings of the instruction-response\n pairs and the nearest neighbor.\n\n Args:\n inputs: The input data.\n\n Returns:\n The input data with the cosine distance computed.\n \"\"\"\n embeddings = np.array([input[\"embedding\"] for input in inputs])\n if self.normalize_embeddings:\n embeddings = self._normalize_embeddings(embeddings)\n self._logger.info(\"\ud83d\udccf Computing nearest neighbor distance...\")\n\n if self.distance_metric == \"cosine\":\n self._logger.info(\"\ud83d\udccf Using cosine distance.\")\n distances = self._cosine_distance(embeddings)\n else:\n self._logger.info(\"\ud83d\udccf Using manhattan distance.\")\n distances = self._manhattan_distance(embeddings)\n\n for distance, input in zip(distances, inputs):\n input[\"nearest_neighbor_distance\"] = distance\n return inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._normalize_embeddings","title":"_normalize_embeddings(embeddings) ","text":"Normalize the embeddings. Parameters: Name Type Description Default embeddings ndarray The embeddings to normalize. required Returns: Type Description ndarray The normalized embeddings. Source code in src/distilabel/steps/deita.py def _normalize_embeddings(self, embeddings: np.ndarray) -> np.ndarray:\n \"\"\"Normalize the embeddings.\n\n Args:\n embeddings: The embeddings to normalize.\n\n Returns:\n The normalized embeddings.\n \"\"\"\n self._logger.info(\"\u2696\ufe0f Normalizing embeddings...\")\n norms = np.linalg.norm(embeddings, axis=1, keepdims=True)\n return embeddings / norms\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._cosine_distance","title":"_cosine_distance(embeddings) ","text":"Computes the cosine distance between the embeddings. Parameters: Name Type Description Default embeddings array The embeddings. required Returns: Type Description array The cosine distance between the embeddings. Source code in src/distilabel/steps/deita.py def _cosine_distance(self, embeddings: np.array) -> np.array: # type: ignore\n \"\"\"Computes the cosine distance between the embeddings.\n\n Args:\n embeddings: The embeddings.\n\n Returns:\n The cosine distance between the embeddings.\n \"\"\"\n cosine_similarity = np.dot(embeddings, embeddings.T)\n cosine_distance = 1 - cosine_similarity\n # Ignore self-distance\n np.fill_diagonal(cosine_distance, np.inf)\n return np.min(cosine_distance, axis=1)\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._manhattan_distance","title":"_manhattan_distance(embeddings) ","text":"Computes the manhattan distance between the embeddings. Parameters: Name Type Description Default embeddings array The embeddings. required Returns: Type Description array The manhattan distance between the embeddings. 
Source code in src/distilabel/steps/deita.py def _manhattan_distance(self, embeddings: np.array) -> np.array: # type: ignore\n \"\"\"Computes the manhattan distance between the embeddings.\n\n Args:\n embeddings: The embeddings.\n\n Returns:\n The manhattan distance between the embeddings.\n \"\"\"\n manhattan_distance = np.abs(embeddings[:, None] - embeddings).sum(-1)\n # Ignore self-distance\n np.fill_diagonal(manhattan_distance, np.inf)\n return np.min(manhattan_distance, axis=1)\n "},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingGeneration","title":"EmbeddingGeneration ","text":" Bases: Step Generate embeddings using an Embeddings model. EmbeddingGeneration is a Step that using an Embeddings model generates sentence embeddings for the provided input texts. Attributes: Name Type Description embeddings Embeddings the Embeddings model used to generate the sentence embeddings. Input columns - text (
str ): The text for which the sentence embedding has to be generated. Output columns - embedding (
List[Union[float, int]] ): the generated sentence embedding. Categories Examples: Generate sentence embeddings with Sentence Transformers: from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.steps import EmbeddingGeneration\n\nembedding_generation = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\",\n )\n)\n\nembedding_generation.load()\n\nresult = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n# [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n Source code in src/distilabel/steps/embeddings/embedding_generation.py class EmbeddingGeneration(Step):\n \"\"\"Generate embeddings using an `Embeddings` model.\n\n `EmbeddingGeneration` is a `Step` that using an `Embeddings` model generates sentence\n embeddings for the provided input texts.\n\n Attributes:\n embeddings: the `Embeddings` model used to generate the sentence embeddings.\n\n Input columns:\n - text (`str`): The text for which the sentence embedding has to be generated.\n\n Output columns:\n - embedding (`List[Union[float, int]]`): the generated sentence embedding.\n\n Categories:\n - embedding\n\n Examples:\n Generate sentence embeddings with Sentence Transformers:\n\n ```python\n from distilabel.models import SentenceTransformerEmbeddings\n from distilabel.steps import EmbeddingGeneration\n\n embedding_generation = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\",\n )\n )\n\n embedding_generation.load()\n\n result = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n # [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n ```\n\n \"\"\"\n\n embeddings: Embeddings\n\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"text\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"embedding\", \"model_name\"]\n\n def load(self) -> None:\n \"\"\"Loads the `Embeddings` model.\"\"\"\n super().load()\n\n self.embeddings.load()\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n embeddings = self.embeddings.encode(inputs=[input[\"text\"] for input in inputs])\n for input, embedding in zip(inputs, embeddings):\n input[\"embedding\"] = embedding\n input[\"model_name\"] = self.embeddings.model_name\n yield inputs\n\n def unload(self) -> None:\n super().unload()\n self.embeddings.unload()\n "},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingGeneration.load","title":"load() ","text":"Loads the Embeddings model. Source code in src/distilabel/steps/embeddings/embedding_generation.py def load(self) -> None:\n \"\"\"Loads the `Embeddings` model.\"\"\"\n super().load()\n\n self.embeddings.load()\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour","title":"FaissNearestNeighbour ","text":" Bases: GlobalStep Create a faiss index to get the nearest neighbours. FaissNearestNeighbour is a GlobalStep that creates a faiss index using the Hugging Face datasets library integration, and then gets the nearest neighbours and the scores or distance of the nearest neighbours for each input row. Attributes: Name Type Description device Optional[RuntimeParameter[Union[int, List[int]]]] the CUDA device ID or a list of IDs to be used. If negative integer, it will use all the available GPUs. Defaults to None . 
string_factory Optional[RuntimeParameter[str]] the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None . metric_type Optional[RuntimeParameter[int]] the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is importing faiss and then passing one of faiss.METRIC_x variables. Defaults to None . k Optional[RuntimeParameter[int]] the number of nearest neighbours to search for each input row. Defaults to 1 . search_batch_size Optional[RuntimeParameter[int]] the number of rows to include in a search batch. The value can be adjusted to maximize resource usage or to avoid OOM issues. Defaults to 50 . train_size Optional[RuntimeParameter[int]] If the index needs a training step, specifies how many vectors will be used to train the index. Runtime parameters device : the CUDA device ID or a list of IDs to be used. If negative integer, it will use all the available GPUs. Defaults to None . string_factory : the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None . metric_type : the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is importing faiss and then passing one of faiss.METRIC_x variables. Defaults to None . k : the number of nearest neighbours to search for each input row. Defaults to 1 . search_batch_size : the number of rows to include in a search batch. The value can be adjusted to maximize resource usage or to avoid OOM issues. Defaults to 50 . train_size : If the index needs a training step, specifies how many vectors will be used to train the index. Input columns - embedding (
List[Union[float, int]] ): a sentence embedding. Output columns - nn_indices (
List[int] ): a list containing the indices of the k nearest neighbours in the inputs for the row. - nn_scores (
List[float] ): a list containing the score or distance to each k nearest neighbour in the inputs. Categories References Examples: Generating embeddings and getting the nearest neighbours: from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\nwith Pipeline(name=\"hello\") as pipeline:\n load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n embeddings = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\"\n )\n )\n\n nearest_neighbours = FaissNearestNeighbour()\n\n load_data >> embeddings >> nearest_neighbours\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n load_data.name: {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n },\n use_cache=False,\n )\n Citations @misc{douze2024faisslibrary,\n title={The Faiss library},\n author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazar\u00e9 and Maria Lomeli and Lucas Hosseini and Herv\u00e9 J\u00e9gou},\n year={2024},\n eprint={2401.08281},\n archivePrefix={arXiv},\n primaryClass={cs.LG},\n url={https://arxiv.org/abs/2401.08281},\n}\n Source code in src/distilabel/steps/embeddings/nearest_neighbour.py class FaissNearestNeighbour(GlobalStep):\n \"\"\"Create a `faiss` index to get the nearest neighbours.\n\n `FaissNearestNeighbour` is a `GlobalStep` that creates a `faiss` index using the Hugging\n Face `datasets` library integration, and then gets the nearest neighbours and the scores\n or distance of the nearest neighbours for each input row.\n\n Attributes:\n device: the CUDA device ID or a list of IDs to be used. If negative integer, it\n will use all the available GPUs. Defaults to `None`.\n string_factory: the name of the factory to be used to build the `faiss` index.\n Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\n Defaults to `None`.\n metric_type: the metric to be used to measure the distance between the points. It's\n an integer and the recommend way to pass it is importing `faiss` and then passing\n one of `faiss.METRIC_x` variables. Defaults to `None`.\n k: the number of nearest neighbours to search for each input row. Defaults to `1`.\n search_batch_size: the number of rows to include in a search batch. The value can\n be adjusted to maximize the resources usage or to avoid OOM issues. Defaults\n to `50`.\n train_size: If the index needs a training step, specifies how many vectors will be\n used to train the index.\n\n Runtime parameters:\n - `device`: the CUDA device ID or a list of IDs to be used. If negative integer,\n it will use all the available GPUs. Defaults to `None`.\n - `string_factory`: the name of the factory to be used to build the `faiss` index.\n Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\n Defaults to `None`.\n - `metric_type`: the metric to be used to measure the distance between the points.\n It's an integer and the recommend way to pass it is importing `faiss` and then\n passing one of `faiss.METRIC_x` variables. Defaults to `None`.\n - `k`: the number of nearest neighbours to search for each input row. Defaults to `1`.\n - `search_batch_size`: the number of rows to include in a search batch. 
The value\n can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults\n to `50`.\n - `train_size`: If the index needs a training step, specifies how many vectors will\n be used to train the index.\n\n Input columns:\n - embedding (`List[Union[float, int]]`): a sentence embedding.\n\n Output columns:\n - nn_indices (`List[int]`): a list containing the indices of the `k` nearest neighbours\n in the inputs for the row.\n - nn_scores (`List[float]`): a list containing the score or distance to each `k`\n nearest neighbour in the inputs.\n\n Categories:\n - embedding\n\n References:\n - [`The Faiss library`](https://arxiv.org/abs/2401.08281)\n\n Examples:\n Generating embeddings and getting the nearest neighbours:\n\n ```python\n from distilabel.models import SentenceTransformerEmbeddings\n from distilabel.pipeline import Pipeline\n from distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\n with Pipeline(name=\"hello\") as pipeline:\n load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n embeddings = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\"\n )\n )\n\n nearest_neighbours = FaissNearestNeighbour()\n\n load_data >> embeddings >> nearest_neighbours\n\n if __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n load_data.name: {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n },\n use_cache=False,\n )\n ```\n\n Citations:\n ```\n @misc{douze2024faisslibrary,\n title={The Faiss library},\n author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazar\u00e9 and Maria Lomeli and Lucas Hosseini and Herv\u00e9 J\u00e9gou},\n year={2024},\n eprint={2401.08281},\n archivePrefix={arXiv},\n primaryClass={cs.LG},\n url={https://arxiv.org/abs/2401.08281},\n }\n ```\n \"\"\"\n\n device: Optional[RuntimeParameter[Union[int, List[int]]]] = Field(\n default=None,\n description=\"The CUDA device ID or a list of IDs to be used. If negative integer,\"\n \" it will use all the available GPUs.\",\n )\n string_factory: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The name of the factory to be used to build the `faiss` index.\"\n \"Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\",\n )\n metric_type: Optional[RuntimeParameter[int]] = Field(\n default=None,\n description=\"The metric to be used to measure the distance between the points. It's\"\n \" an integer and the recommend way to pass it is importing `faiss` and thenpassing\"\n \" one of `faiss.METRIC_x` variables.\",\n )\n k: Optional[RuntimeParameter[int]] = Field(\n default=1,\n description=\"The number of nearest neighbours to search for each input row.\",\n )\n search_batch_size: Optional[RuntimeParameter[int]] = Field(\n default=50,\n description=\"The number of rows to include in a search batch. The value can be adjusted\"\n \" to maximize the resources usage or to avoid OOM issues.\",\n )\n train_size: Optional[RuntimeParameter[int]] = Field(\n default=None,\n description=\"If the index needs a training step, specifies how many vectors will be used to train the index.\",\n )\n\n def load(self) -> None:\n super().load()\n\n if importlib.util.find_spec(\"faiss\") is None:\n raise ImportError(\n \"`faiss` package is not installed. 
Please install it using `pip install\"\n \" faiss-cpu` or `pip install faiss-gpu`.\"\n )\n\n @property\n def inputs(self) -> List[str]:\n return [\"embedding\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"nn_indices\", \"nn_scores\"]\n\n def _build_index(self, inputs: List[Dict[str, Any]]) -> Dataset:\n \"\"\"Builds a `faiss` index using `datasets` integration.\n\n Args:\n inputs: a list of dictionaries.\n\n Returns:\n The build `datasets.Dataset` with its `faiss` index.\n \"\"\"\n dataset = Dataset.from_list(inputs)\n if self.train_size is not None and self.string_factory:\n self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Starting Faiss index training...\")\n dataset.add_faiss_index(\n column=\"embedding\",\n device=self.device, # type: ignore\n string_factory=self.string_factory,\n metric_type=self.metric_type,\n train_size=self.train_size,\n )\n return dataset\n\n def _save_index(self, dataset: Dataset) -> None:\n \"\"\"Save the generated Faiss index as an artifact of the step.\n\n Args:\n dataset: the dataset with the `faiss` index built.\n \"\"\"\n self.save_artifact(\n name=\"faiss_index\",\n write_function=lambda path: dataset.save_faiss_index(\n index_name=\"embedding\", file=path / \"index.faiss\"\n ),\n metadata={\n \"num_rows\": len(dataset),\n \"embedding_dim\": len(dataset[0][\"embedding\"]),\n },\n )\n\n def _search(self, dataset: Dataset) -> Dataset:\n \"\"\"Search the top `k` nearest neighbours for each row in the dataset.\n\n Args:\n dataset: the dataset with the `faiss` index built.\n\n Returns:\n The updated dataset containing the top `k` nearest neighbours for each row,\n as well as the score or distance.\n \"\"\"\n\n def add_search_results(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:\n queries = np.array(examples[\"embedding\"])\n results = dataset.search_batch(\n index_name=\"embedding\",\n queries=queries,\n k=self.k + 1, # type: ignore\n )\n examples[\"nn_indices\"] = [indices[1:] for indices in results.total_indices]\n examples[\"nn_scores\"] = [scores[1:] for scores in results.total_scores]\n return examples\n\n return dataset.map(\n add_search_results, batched=True, batch_size=self.search_batch_size\n )\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n dataset = self._build_index(inputs)\n dataset_with_search_results = self._search(dataset)\n self._save_index(dataset)\n yield dataset_with_search_results.to_list()\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._build_index","title":"_build_index(inputs) ","text":"Builds a faiss index using datasets integration. Parameters: Name Type Description Default inputs List[Dict[str, Any]] a list of dictionaries. required Returns: Type Description Dataset The build datasets.Dataset with its faiss index. 
Source code in src/distilabel/steps/embeddings/nearest_neighbour.py def _build_index(self, inputs: List[Dict[str, Any]]) -> Dataset:\n \"\"\"Builds a `faiss` index using `datasets` integration.\n\n Args:\n inputs: a list of dictionaries.\n\n Returns:\n The build `datasets.Dataset` with its `faiss` index.\n \"\"\"\n dataset = Dataset.from_list(inputs)\n if self.train_size is not None and self.string_factory:\n self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Starting Faiss index training...\")\n dataset.add_faiss_index(\n column=\"embedding\",\n device=self.device, # type: ignore\n string_factory=self.string_factory,\n metric_type=self.metric_type,\n train_size=self.train_size,\n )\n return dataset\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._save_index","title":"_save_index(dataset) ","text":"Save the generated Faiss index as an artifact of the step. Parameters: Name Type Description Default dataset Dataset the dataset with the faiss index built. required Source code in src/distilabel/steps/embeddings/nearest_neighbour.py def _save_index(self, dataset: Dataset) -> None:\n \"\"\"Save the generated Faiss index as an artifact of the step.\n\n Args:\n dataset: the dataset with the `faiss` index built.\n \"\"\"\n self.save_artifact(\n name=\"faiss_index\",\n write_function=lambda path: dataset.save_faiss_index(\n index_name=\"embedding\", file=path / \"index.faiss\"\n ),\n metadata={\n \"num_rows\": len(dataset),\n \"embedding_dim\": len(dataset[0][\"embedding\"]),\n },\n )\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._search","title":"_search(dataset) ","text":"Search the top k nearest neighbours for each row in the dataset. Parameters: Name Type Description Default dataset Dataset the dataset with the faiss index built. required Returns: Type Description Dataset The updated dataset containing the top k nearest neighbours for each row, Dataset as well as the score or distance. Source code in src/distilabel/steps/embeddings/nearest_neighbour.py def _search(self, dataset: Dataset) -> Dataset:\n \"\"\"Search the top `k` nearest neighbours for each row in the dataset.\n\n Args:\n dataset: the dataset with the `faiss` index built.\n\n Returns:\n The updated dataset containing the top `k` nearest neighbours for each row,\n as well as the score or distance.\n \"\"\"\n\n def add_search_results(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:\n queries = np.array(examples[\"embedding\"])\n results = dataset.search_batch(\n index_name=\"embedding\",\n queries=queries,\n k=self.k + 1, # type: ignore\n )\n examples[\"nn_indices\"] = [indices[1:] for indices in results.total_indices]\n examples[\"nn_scores\"] = [scores[1:] for scores in results.total_scores]\n return examples\n\n return dataset.map(\n add_search_results, batched=True, batch_size=self.search_batch_size\n )\n "},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingDedup","title":"EmbeddingDedup ","text":" Bases: GlobalStep Deduplicates text using embeddings. EmbeddingDedup is a Step that detects near-duplicates in datasets, using embeddings to compare the similarity between the texts. The typical workflow with this step would include having a dataset with embeddings precomputed, and then (possibly using the FaissNearestNeighbour ) using the nn_indices and nn_scores , determine the texts that are duplicate. Attributes: Name Type Description threshold Optional[RuntimeParameter[float]] the threshold to consider 2 examples as duplicates. 
It's dependent on the type of index that was used to generate the embeddings. For example, if the embeddings were generated using cosine similarity, a threshold of 0.9 would make all the texts with a cosine similarity above that value duplicates. Higher values detect fewer duplicates in such an index, but that should be taken into account when building it. Defaults to 0.9 . Runtime Parameters threshold : the threshold to consider 2 examples as duplicates. Input columns - nn_indices (
List[int] ): a list containing the indices of the k nearest neighbours in the inputs for the row. - nn_scores (
List[float] ): a list containing the score or distance to each k nearest neighbour in the inputs. Output columns - keep_row_after_embedding_filtering (
bool ): boolean indicating if the piece text is not a duplicate i.e. this text should be kept. Categories Examples: Deduplicate a list of texts using embedding information:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n data = LoadDataFromDicts(\n data=[\n {\n \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n \"embedding\": [\n 0.018477669046149742,\n -0.03748236608841726,\n 0.001919870620352492,\n 0.024918478063770535,\n 0.02348063521315178,\n 0.0038251285566308375,\n -0.01723884983037716,\n 0.02881971942372201,\n ],\n \"nn_indices\": [0, 1],\n \"nn_scores\": [\n 0.9164746999740601,\n 0.782106876373291,\n ],\n },\n {\n \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n \"embedding\": [\n -0.0023464179614082125,\n -0.07325472251663565,\n -0.06058678419516501,\n -0.02100326928586996,\n -0.013462744792362657,\n 0.027368447064244242,\n -0.003916070100455717,\n 0.01243614518480423,\n ],\n \"nn_indices\": [0, 2],\n \"nn_scores\": [\n 0.7552462220191956,\n 0.7261884808540344,\n ],\n },\n {\n \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on breaking down complex music notation into understandable steps for their students.\",\n \"embedding\": [\n -0.01630817942328242,\n -0.023760151552345232,\n -0.014249650090627883,\n -0.005713686451446624,\n -0.016033059279131567,\n 0.0071440908501058786,\n -0.05691099643425161,\n 0.01597412704817784,\n ],\n \"nn_indices\": [1, 2],\n \"nn_scores\": [\n 0.8107735514640808,\n 0.7172299027442932,\n ],\n },\n ],\n batch_size=batch_size,\n )\n # In general you should do something like this before the deduplication step, to obtain the\n # `nn_indices` and `nn_scores`. In this case the embeddings are already normalized, so there's\n # no need for it.\n # nn = FaissNearestNeighbour(\n # k=30,\n # metric_type=faiss.METRIC_INNER_PRODUCT,\n # search_batch_size=50,\n # train_size=len(dataset), # The number of embeddings to use for training\n # string_factory=\"IVF300_HNSW32,Flat\" # To use an index (optional, maybe required for big datasets)\n # )\n # Read more about the `string_factory` here:\n # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n embedding_dedup = EmbeddingDedup(\n threshold=0.8,\n input_batch_size=batch_size,\n )\n\n data >> embedding_dedup\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n```\n Source code in src/distilabel/steps/filtering/embedding.py class EmbeddingDedup(GlobalStep):\n \"\"\"Deduplicates text using embeddings.\n\n `EmbeddingDedup` is a Step that detects near-duplicates in datasets, using\n embeddings to compare the similarity between the texts. 
The typical workflow with this step\n would include having a dataset with embeddings precomputed, and then (possibly using the\n `FaissNearestNeighbour`) using the `nn_indices` and `nn_scores`, determine the texts that\n are duplicate.\n\n Attributes:\n threshold: the threshold to consider 2 examples as duplicates.\n It's dependent on the type of index that was used to generate the embeddings.\n For example, if the embeddings were generated using cosine similarity, a threshold\n of `0.9` would make all the texts with a cosine similarity above the value\n duplicates. Higher values detect less duplicates in such an index, but that should\n be taken into account when building it. Defaults to `0.9`.\n\n Runtime Parameters:\n - `threshold`: the threshold to consider 2 examples as duplicates.\n\n Input columns:\n - nn_indices (`List[int]`): a list containing the indices of the `k` nearest neighbours\n in the inputs for the row.\n - nn_scores (`List[float]`): a list containing the score or distance to each `k`\n nearest neighbour in the inputs.\n\n Output columns:\n - keep_row_after_embedding_filtering (`bool`): boolean indicating if the piece `text` is\n not a duplicate i.e. this text should be kept.\n\n Categories:\n - filtering\n\n Examples:\n\n Deduplicate a list of texts using embedding information:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps import EmbeddingDedup\n from distilabel.steps import LoadDataFromDicts\n\n with Pipeline() as pipeline:\n data = LoadDataFromDicts(\n data=[\n {\n \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n \"embedding\": [\n 0.018477669046149742,\n -0.03748236608841726,\n 0.001919870620352492,\n 0.024918478063770535,\n 0.02348063521315178,\n 0.0038251285566308375,\n -0.01723884983037716,\n 0.02881971942372201,\n ],\n \"nn_indices\": [0, 1],\n \"nn_scores\": [\n 0.9164746999740601,\n 0.782106876373291,\n ],\n },\n {\n \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n \"embedding\": [\n -0.0023464179614082125,\n -0.07325472251663565,\n -0.06058678419516501,\n -0.02100326928586996,\n -0.013462744792362657,\n 0.027368447064244242,\n -0.003916070100455717,\n 0.01243614518480423,\n ],\n \"nn_indices\": [0, 2],\n \"nn_scores\": [\n 0.7552462220191956,\n 0.7261884808540344,\n ],\n },\n {\n \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on breaking down complex music notation into understandable steps for their students.\",\n \"embedding\": [\n -0.01630817942328242,\n -0.023760151552345232,\n -0.014249650090627883,\n -0.005713686451446624,\n -0.016033059279131567,\n 0.0071440908501058786,\n -0.05691099643425161,\n 0.01597412704817784,\n ],\n \"nn_indices\": [1, 2],\n \"nn_scores\": [\n 0.8107735514640808,\n 0.7172299027442932,\n ],\n },\n ],\n batch_size=batch_size,\n )\n # In general you should do something like this before the deduplication step, to obtain the\n # `nn_indices` and `nn_scores`. 
In this case the embeddings are already normalized, so there's\n # no need for it.\n # nn = FaissNearestNeighbour(\n # k=30,\n # metric_type=faiss.METRIC_INNER_PRODUCT,\n # search_batch_size=50,\n # train_size=len(dataset), # The number of embeddings to use for training\n # string_factory=\"IVF300_HNSW32,Flat\" # To use an index (optional, maybe required for big datasets)\n # )\n # Read more about the `string_factory` here:\n # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n embedding_dedup = EmbeddingDedup(\n threshold=0.8,\n input_batch_size=batch_size,\n )\n\n data >> embedding_dedup\n\n if __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n ```\n \"\"\"\n\n threshold: Optional[RuntimeParameter[float]] = Field(\n default=0.9,\n description=\"The threshold to consider 2 examples as duplicates. It's dependent \"\n \"on the type of index that was used to generate the embeddings. For example, if \"\n \"the embeddings were generated using cosine similarity, a threshold of `0.9` \"\n \"would make all the texts with a cosine similarity above the value duplicates. \"\n \"Higher values detect less duplicates in such an index, but that should be \"\n \"taken into account when building it.\",\n )\n\n @property\n def inputs(self) -> List[str]:\n return [\"nn_scores\", \"nn_indices\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"keep_row_after_embedding_filtering\"]\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n rows_to_remove = set()\n\n for input in track(inputs, description=\"Running Embedding deduplication...\"):\n input[\"keep_row_after_embedding_filtering\"] = True\n indices_scores = np.array(input[\"nn_scores\"]) > self.threshold\n indices = np.array(input[\"nn_indices\"])[indices_scores]\n if len(indices) > 0: # If there are any rows found over the threshold\n rows_to_remove.update(list(indices))\n\n # Remove duplicates and get the list of rows to remove\n for idx in rows_to_remove:\n inputs[idx][\"keep_row_after_embedding_filtering\"] = False\n\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.MinHashDedup","title":"MinHashDedup ","text":" Bases: Step Deduplicates text using MinHash and MinHashLSH . MinHashDedup is a Step that detects near-duplicates in datasets. The idea roughly translates to the following steps: 1. Tokenize the text into words or ngrams. 2. Create a MinHash for each text. 3. Store the MinHashes in a MinHashLSH . 4. Check if the MinHash is already in the LSH , if so, it is a duplicate. Attributes: Name Type Description num_perm int the number of permutations to use. Defaults to 128 . seed int the seed to use for the MinHash. Defaults to 1 . tokenizer Literal['words', 'ngrams'] the tokenizer to use. Available ones are words or ngrams . If words is selected, it tokenizes the text into words using nltk's word tokenizer. ngram estimates the ngrams (together with the size n ). Defaults to words . n Optional[int] the size of the ngrams to use. Only relevant if tokenizer=\"ngrams\" . Defaults to 5 . threshold float the threshold to consider two MinHashes as duplicates. Values closer to 0 detect more duplicates. Defaults to 0.9 . storage Literal['dict', 'disk'] the storage to use for the LSH. Can be dict to store the index in memory, or disk . 
Keep in mind, disk is an experimental feature not defined in datasketch , that is based on DiskCache's Index class. It should work as a dict backed by disk, but depending on the system it can be slower. Defaults to dict . Input columns - text (
str ): the texts to be filtered. Output columns - keep_row_after_minhash_filtering (
bool ): boolean indicating if the piece text is not a duplicate i.e. this text should be kept. Categories References datasketch documentation - Identifying and Filtering Near-Duplicate Documents
- Diskcache's Index
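As a complement to the pipeline example that follows, here is a minimal sketch of the underlying MinHash/LSH check using `datasketch` directly. The tokenization below is a naive `split()` chosen only for illustration, and the texts reuse the toy values from the example; this is not the step's own implementation:

```python
from datasketch import MinHash, MinHashLSH


def minhash_for(text: str, num_perm: int = 128) -> MinHash:
    """Build a MinHash from a naive word tokenization of `text`."""
    m = MinHash(num_perm=num_perm)
    for token in text.lower().split():
        m.update(token.encode("utf-8"))
    return m


lsh = MinHashLSH(threshold=0.9, num_perm=128)
texts = [
    "This is a test document.",
    "This document is a test.",
    "This is another unique document.",
]

for i, text in enumerate(texts):
    m = minhash_for(text)
    if lsh.query(m):
        # A near-duplicate is already indexed: this row would be filtered out.
        print(f"near-duplicate found for: {text!r}")
    else:
        # Otherwise index the MinHash and keep the row.
        lsh.insert(str(i), m)
```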
Examples: Deduplicate a list of texts using MinHash and MinHashLSH:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import MinHashDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n ds_size = 1000\n batch_size = 500 # Bigger batch sizes work better for this step\n data = LoadDataFromDicts(\n data=[\n {\"text\": \"This is a test document.\"},\n {\"text\": \"This document is a test.\"},\n {\"text\": \"Test document for duplication.\"},\n {\"text\": \"Document for duplication test.\"},\n {\"text\": \"This is another unique document.\"},\n ]\n * (ds_size // 5),\n batch_size=batch_size,\n )\n minhash_dedup = MinHashDedup(\n tokenizer=\"words\",\n threshold=0.9, # lower values will increase the number of duplicates\n storage=\"dict\", # or \"disk\" for bigger datasets\n )\n\n data >> minhash_dedup\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n```\n Source code in src/distilabel/steps/filtering/minhash.py class MinHashDedup(Step):\n \"\"\"Deduplicates text using `MinHash` and `MinHashLSH`.\n\n `MinHashDedup` is a Step that detects near-duplicates in datasets. The idea roughly translates\n to the following steps:\n 1. Tokenize the text into words or ngrams.\n 2. Create a `MinHash` for each text.\n 3. Store the `MinHashes` in a `MinHashLSH`.\n 4. Check if the `MinHash` is already in the `LSH`, if so, it is a duplicate.\n\n Attributes:\n num_perm: the number of permutations to use. Defaults to `128`.\n seed: the seed to use for the MinHash. Defaults to `1`.\n tokenizer: the tokenizer to use. Available ones are `words` or `ngrams`.\n If `words` is selected, it tokenizes the text into words using nltk's\n word tokenizer. `ngram` estimates the ngrams (together with the size\n `n`). Defaults to `words`.\n n: the size of the ngrams to use. Only relevant if `tokenizer=\"ngrams\"`. Defaults to `5`.\n threshold: the threshold to consider two MinHashes as duplicates.\n Values closer to 0 detect more duplicates. Defaults to `0.9`.\n storage: the storage to use for the LSH. Can be `dict` to store the index\n in memory, or `disk`. Keep in mind, `disk` is an experimental feature\n not defined in `datasketch`, that is based on DiskCache's `Index` class.\n It should work as a `dict`, but backed by disk, but depending on the system\n it can be slower. Defaults to `dict`.\n\n Input columns:\n - text (`str`): the texts to be filtered.\n\n Output columns:\n - keep_row_after_minhash_filtering (`bool`): boolean indicating if the piece `text` is\n not a duplicate i.e. 
this text should be kept.\n\n Categories:\n - filtering\n\n References:\n - [`datasketch documentation`](https://ekzhu.github.io/datasketch/lsh.html)\n - [Identifying and Filtering Near-Duplicate Documents](https://cs.brown.edu/courses/cs253/papers/nearduplicate.pdf)\n - [Diskcache's Index](https://grantjenks.com/docs/diskcache/api.html#diskcache.Index)\n\n Examples:\n\n Deduplicate a list of texts using MinHash and MinHashLSH:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps import MinHashDedup\n from distilabel.steps import LoadDataFromDicts\n\n with Pipeline() as pipeline:\n ds_size = 1000\n batch_size = 500 # Bigger batch sizes work better for this step\n data = LoadDataFromDicts(\n data=[\n {\"text\": \"This is a test document.\"},\n {\"text\": \"This document is a test.\"},\n {\"text\": \"Test document for duplication.\"},\n {\"text\": \"Document for duplication test.\"},\n {\"text\": \"This is another unique document.\"},\n ]\n * (ds_size // 5),\n batch_size=batch_size,\n )\n minhash_dedup = MinHashDedup(\n tokenizer=\"words\",\n threshold=0.9, # lower values will increase the number of duplicates\n storage=\"dict\", # or \"disk\" for bigger datasets\n )\n\n data >> minhash_dedup\n\n if __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n ```\n \"\"\"\n\n num_perm: int = 128\n seed: int = 1\n tokenizer: Literal[\"words\", \"ngrams\"] = \"words\"\n n: Optional[int] = 5\n threshold: float = 0.9\n storage: Literal[\"dict\", \"disk\"] = \"dict\"\n\n _hasher: Union[\"MinHash\", None] = PrivateAttr(None)\n _tokenizer: Union[Callable, None] = PrivateAttr(None)\n _lhs: Union[\"MinHashLSH\", None] = PrivateAttr(None)\n\n def load(self) -> None:\n super().load()\n if not importlib.import_module(\"datasketch\"):\n raise ImportError(\n \"`datasketch` is needed to deduplicate with MinHash, but is not installed. \"\n \"Please install it using `pip install datasketch`.\"\n )\n from datasketch import MinHash\n\n from distilabel.steps.filtering._datasketch import MinHashLSH\n\n self._hasher = MinHash.bulk\n self._lsh = MinHashLSH(\n num_perm=self.num_perm,\n threshold=self.threshold,\n storage_config={\"type\": self.storage},\n )\n\n if self.tokenizer == \"words\":\n if not importlib.import_module(\"nltk\"):\n raise ImportError(\n \"`nltk` is needed to tokenize based on words, but is not installed. \"\n \"Please install it using `pip install nltk`. 
Then run `nltk.download('punkt_tab')`.\"\n )\n self._tokenizer = tokenized_on_words\n else:\n self._tokenizer = partial(tokenize_on_ngrams, n=self.n)\n\n def unload(self) -> None:\n super().unload()\n # In case of LSH being stored in disk, we need to close the file.\n if self.storage == \"disk\":\n self._lsh.close()\n\n @property\n def inputs(self) -> List[str]:\n return [\"text\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"keep_row_after_minhash_filtering\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\":\n tokenized_texts = []\n for input in inputs:\n tokenized_texts.append(self._tokenizer([input[self.inputs[0]]])[0])\n\n minhashes = self._hasher(\n tokenized_texts, num_perm=self.num_perm, seed=self.seed\n )\n\n for input, minhash in zip(inputs, minhashes):\n # Check if the text is already in the LSH index\n if self._lsh.query(minhash):\n input[\"keep_row_after_minhash_filtering\"] = False\n else:\n self._lsh.insert(str(uuid.uuid4()), minhash)\n input[\"keep_row_after_minhash_filtering\"] = True\n\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate","title":"ConversationTemplate ","text":" Bases: Step Generate a conversation template from an instruction and a response. Input columns - instruction (
str ): The instruction to be used in the conversation. - response (
str ): The response to be used in the conversation. Output columns - conversation (
ChatType ): The conversation template. Categories Examples: Create a conversation from an instruction and a response: from distilabel.steps import ConversationTemplate\n\nconv_template = ConversationTemplate()\nconv_template.load()\n\nresult = next(\n conv_template.process(\n [\n {\n \"instruction\": \"Hello\",\n \"response\": \"Hi\",\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n Source code in src/distilabel/steps/formatting/conversation.py class ConversationTemplate(Step):\n \"\"\"Generate a conversation template from an instruction and a response.\n\n Input columns:\n - instruction (`str`): The instruction to be used in the conversation.\n - response (`str`): The response to be used in the conversation.\n\n Output columns:\n - conversation (`ChatType`): The conversation template.\n\n Categories:\n - format\n - chat\n - template\n\n Examples:\n Create a conversation from an instruction and a response:\n\n ```python\n from distilabel.steps import ConversationTemplate\n\n conv_template = ConversationTemplate()\n conv_template.load()\n\n result = next(\n conv_template.process(\n [\n {\n \"instruction\": \"Hello\",\n \"response\": \"Hi\",\n }\n ],\n )\n )\n # >>> result\n # [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The instruction and response.\"\"\"\n return [\"instruction\", \"response\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The conversation template.\"\"\"\n return [\"conversation\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generate a conversation template from an instruction and a response.\n\n Args:\n inputs: The input data.\n\n Yields:\n The input data with the conversation template.\n \"\"\"\n for input in inputs:\n input[\"conversation\"] = [\n {\"role\": \"user\", \"content\": input[\"instruction\"]},\n {\"role\": \"assistant\", \"content\": input[\"response\"]},\n ]\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.inputs","title":"inputs: StepColumns property ","text":"The instruction and response. "},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.outputs","title":"outputs: StepColumns property ","text":"The conversation template. "},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.process","title":"process(inputs) ","text":"Generate a conversation template from an instruction and a response. Parameters: Name Type Description Default inputs StepInput The input data. required Yields: Type Description StepOutput The input data with the conversation template. 
Source code in src/distilabel/steps/formatting/conversation.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generate a conversation template from an instruction and a response.\n\n Args:\n inputs: The input data.\n\n Yields:\n The input data with the conversation template.\n \"\"\"\n for input in inputs:\n input[\"conversation\"] = [\n {\"role\": \"user\", \"content\": input[\"instruction\"]},\n {\"role\": \"assistant\", \"content\": input[\"response\"]},\n ]\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO","title":"FormatChatGenerationDPO ","text":" Bases: Step Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO). FormatChatGenerationDPO is a Step that formats the output of the combination of a ChatGeneration task with a preference Task i.e. a task generating ratings such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook ., so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings . Note The messages column should contain at least one message from the user, the generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations. Input columns - messages (
List[Dict[str, str]] ): The conversation messages. - generations (
List[str] ): The generations produced by the LLM . - generation_models (
List[str] , optional): The model names used to generate the generations , only available if the model_name from the ChatGeneration task/s is combined into a single column named this way, otherwise, it will be ignored. - ratings (
List[float] ): The ratings for each of the generations , produced by a preference task such as UltraFeedback . Output columns - prompt (
str ): The user message used to generate the generations with the LLM . - prompt_id (
str ): The SHA256 hash of the prompt . - chosen (
List[Dict[str, str]] ): The chosen generation based on the ratings . - chosen_model (
str , optional): The model name used to generate the chosen generation, if the generation_models are available. - chosen_rating (
float ): The rating of the chosen generation. - rejected (
List[Dict[str, str]] ): The rejected generation based on the ratings . - rejected_model (
str , optional): The model name used to generate the rejected generation, if the generation_models are available. - rejected_rating (
float ): The rating of the rejected generation. Categories - format
- chat-generation
- preference
- messages
- generations
Examples: Format your dataset for DPO fine tuning: from distilabel.steps import FormatChatGenerationDPO\n\nformat_dpo = FormatChatGenerationDPO()\nformat_dpo.load()\n\n# NOTE: \"generation_models\" can be added optionally.\nresult = next(\n format_dpo.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n# 'generations': ['4', '5', '6'],\n# 'ratings': [1, 0, -1],\n# 'prompt': \"What's 2+2?\",\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'chosen_rating': 1,\n# 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n# 'rejected_rating': -1\n# }\n# ]\n Source code in src/distilabel/steps/formatting/dpo.py class FormatChatGenerationDPO(Step):\n \"\"\"Format the output of a combination of a `ChatGeneration` + a preference task for Direct Preference Optimization (DPO).\n\n `FormatChatGenerationDPO` is a `Step` that formats the output of the combination of a `ChatGeneration`\n task with a preference `Task` i.e. a task generating `ratings` such as `UltraFeedback` following the standard\n formatting from frameworks such as `axolotl` or `alignment-handbook`., so that those are used to rank the\n existing generations and provide the `chosen` and `rejected` generations based on the `ratings`.\n\n Note:\n The `messages` column should contain at least one message from the user, the `generations`\n column should contain at least two generations, the `ratings` column should contain the same\n number of ratings as generations.\n\n Input columns:\n - messages (`List[Dict[str, str]]`): The conversation messages.\n - generations (`List[str]`): The generations produced by the `LLM`.\n - generation_models (`List[str]`, optional): The model names used to generate the `generations`,\n only available if the `model_name` from the `ChatGeneration` task/s is combined into a single\n column named this way, otherwise, it will be ignored.\n - ratings (`List[float]`): The ratings for each of the `generations`, produced by a preference\n task such as `UltraFeedback`.\n\n Output columns:\n - prompt (`str`): The user message used to generate the `generations` with the `LLM`.\n - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n - chosen (`List[Dict[str, str]]`): The `chosen` generation based on the `ratings`.\n - chosen_model (`str`, optional): The model name used to generate the `chosen` generation,\n if the `generation_models` are available.\n - chosen_rating (`float`): The rating of the `chosen` generation.\n - rejected (`List[Dict[str, str]]`): The `rejected` generation based on the `ratings`.\n - rejected_model (`str`, optional): The model name used to generate the `rejected` generation,\n if the `generation_models` are available.\n - rejected_rating (`float`): The rating of the `rejected` generation.\n\n Categories:\n - format\n - chat-generation\n - preference\n - messages\n - generations\n\n Examples:\n Format your dataset for DPO fine tuning:\n\n ```python\n from distilabel.steps import FormatChatGenerationDPO\n\n format_dpo = FormatChatGenerationDPO()\n format_dpo.load()\n\n # NOTE: \"generation_models\" can be added optionally.\n result = next(\n format_dpo.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": 
\"What's 2+2?\"}],\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n )\n # >>> result\n # [\n # {\n # 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n # 'generations': ['4', '5', '6'],\n # 'ratings': [1, 0, -1],\n # 'prompt': \"What's 2+2?\",\n # 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n # 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n # 'chosen_rating': 1,\n # 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n # 'rejected_rating': -1\n # }\n # ]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"List of inputs required by the `Step`, which in this case are: `messages`, `generations`,\n and `ratings`.\"\"\"\n return [\"messages\", \"generations\", \"ratings\"]\n\n @property\n def optional_inputs(self) -> List[str]:\n \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n which in this case is: `generation_models`.\"\"\"\n return [\"generation_models\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `chosen`,\n `chosen_model`, `chosen_rating`, `rejected`, `rejected_model`, `rejected_rating`. Both\n the `chosen_model` and `rejected_model` being optional and only used if `generation_models`\n is available.\n\n Reference:\n - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n \"\"\"\n return [\n \"prompt\",\n \"prompt_id\",\n \"chosen\",\n \"chosen_model\",\n \"chosen_rating\",\n \"rejected\",\n \"rejected_model\",\n \"rejected_rating\",\n ]\n\n def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the DPO formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = next(\n (\n turn[\"content\"]\n for turn in item[\"messages\"]\n if turn[\"role\"] == \"user\"\n ),\n None,\n )\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"chosen\"] = item[\"messages\"] + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][chosen_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"rejected\"] = item[\"messages\"] + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][rejected_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.inputs","title":"inputs: StepColumns property ","text":"List of inputs required by the Step , which in this case are: messages , generations , and ratings . 
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.optional_inputs","title":"optional_inputs: List[str] property ","text":"List of optional inputs, which are not required by the Step but used if available, which in this case is: generation_models . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.outputs","title":"outputs: StepColumns property ","text":"List of outputs generated by the Step , which are: prompt , prompt_id , chosen , chosen_model , chosen_rating , rejected , rejected_model , rejected_rating . Both the chosen_model and rejected_model being optional and only used if generation_models is available. Reference - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.process","title":"process(*inputs) ","text":"The process method formats the received StepInput or list of StepInput according to the DPO formatting standard. Parameters: Name Type Description Default *inputs StepInput A list of StepInput to be combined. () Yields: Type Description StepOutput A StepOutput with batches of formatted StepInput following the DPO standard. Source code in src/distilabel/steps/formatting/dpo.py def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the DPO formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = next(\n (\n turn[\"content\"]\n for turn in item[\"messages\"]\n if turn[\"role\"] == \"user\"\n ),\n None,\n )\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"chosen\"] = item[\"messages\"] + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][chosen_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"rejected\"] = item[\"messages\"] + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][rejected_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO","title":"FormatTextGenerationDPO ","text":" Bases: Step Format the output of your LLMs for Direct Preference Optimization (DPO). FormatTextGenerationDPO is a Step that formats the output of the combination of a TextGeneration task with a preference Task i.e. a task generating ratings , so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings . Use this step to transform the output of a combination of a TextGeneration + a preference task such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook . Note The generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations. Input columns - system_prompt (
str , optional): The system prompt used within the LLM to generate the generations , if available. - instruction (
str ): The instruction used to generate the generations with the LLM . - generations (
List[str] ): The generations produced by the LLM . - generation_models (
List[str] , optional): The model names used to generate the generations , only available if the model_name from the TextGeneration task/s is combined into a single column named this way, otherwise, it will be ignored. - ratings (
List[float] ): The ratings for each of the generations , produced by a preference task such as UltraFeedback . Output columns - prompt (
str ): The instruction used to generate the generations with the LLM . - prompt_id (
str ): The SHA256 hash of the prompt . - chosen (
List[Dict[str, str]] ): The chosen generation based on the ratings . - chosen_model (
str , optional): The model name used to generate the chosen generation, if the generation_models are available. - chosen_rating (
float ): The rating of the chosen generation. - rejected (
List[Dict[str, str]] ): The rejected generation based on the ratings . - rejected_model (
str , optional): The model name used to generate the rejected generation, if the generation_models are available. - rejected_rating (
float ): The rating of the rejected generation. Categories - format
- text-generation
- preference
- instruction
- generations
Examples: Format your dataset for DPO fine tuning: from distilabel.steps import FormatTextGenerationDPO\n\nformat_dpo = FormatTextGenerationDPO()\nformat_dpo.load()\n\n# NOTE: Both \"system_prompt\" and \"generation_models\" can be added optionally.\nresult = next(\n format_dpo.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n)\n# >>> result\n# [\n# { 'instruction': \"What's 2+2?\",\n# 'generations': ['4', '5', '6'],\n# 'ratings': [1, 0, -1],\n# 'prompt': \"What's 2+2?\",\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'chosen_rating': 1,\n# 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n# 'rejected_rating': -1\n# }\n# ]\n Source code in src/distilabel/steps/formatting/dpo.py class FormatTextGenerationDPO(Step):\n \"\"\"Format the output of your LLMs for Direct Preference Optimization (DPO).\n\n `FormatTextGenerationDPO` is a `Step` that formats the output of the combination of a `TextGeneration`\n task with a preference `Task` i.e. a task generating `ratings`, so that those are used to rank the\n existing generations and provide the `chosen` and `rejected` generations based on the `ratings`.\n Use this step to transform the output of a combination of a `TextGeneration` + a preference task such as\n `UltraFeedback` following the standard formatting from frameworks such as `axolotl` or `alignment-handbook`.\n\n Note:\n The `generations` column should contain at least two generations, the `ratings` column should\n contain the same number of ratings as generations.\n\n Input columns:\n - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n `generations`, if available.\n - instruction (`str`): The instruction used to generate the `generations` with the `LLM`.\n - generations (`List[str]`): The generations produced by the `LLM`.\n - generation_models (`List[str]`, optional): The model names used to generate the `generations`,\n only available if the `model_name` from the `TextGeneration` task/s is combined into a single\n column named this way, otherwise, it will be ignored.\n - ratings (`List[float]`): The ratings for each of the `generations`, produced by a preference\n task such as `UltraFeedback`.\n\n Output columns:\n - prompt (`str`): The instruction used to generate the `generations` with the `LLM`.\n - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n - chosen (`List[Dict[str, str]]`): The `chosen` generation based on the `ratings`.\n - chosen_model (`str`, optional): The model name used to generate the `chosen` generation,\n if the `generation_models` are available.\n - chosen_rating (`float`): The rating of the `chosen` generation.\n - rejected (`List[Dict[str, str]]`): The `rejected` generation based on the `ratings`.\n - rejected_model (`str`, optional): The model name used to generate the `rejected` generation,\n if the `generation_models` are available.\n - rejected_rating (`float`): The rating of the `rejected` generation.\n\n Categories:\n - format\n - text-generation\n - preference\n - instruction\n - generations\n\n Examples:\n Format your dataset for DPO fine tuning:\n\n ```python\n from distilabel.steps import FormatTextGenerationDPO\n\n format_dpo = FormatTextGenerationDPO()\n format_dpo.load()\n\n # NOTE: Both \"system_prompt\" and \"generation_models\" can 
be added optionally.\n result = next(\n format_dpo.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n )\n # >>> result\n # [\n # { 'instruction': \"What's 2+2?\",\n # 'generations': ['4', '5', '6'],\n # 'ratings': [1, 0, -1],\n # 'prompt': \"What's 2+2?\",\n # 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n # 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n # 'chosen_rating': 1,\n # 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n # 'rejected_rating': -1\n # }\n # ]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"List of inputs required by the `Step`, which in this case are: `instruction`, `generations`,\n and `ratings`.\"\"\"\n return {\n \"system_prompt\": False,\n \"instruction\": True,\n \"generations\": True,\n \"generation_models\": False,\n \"ratings\": True,\n }\n\n @property\n def optional_inputs(self) -> List[str]:\n \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n which in this case are: `system_prompt`, and `generation_models`.\"\"\"\n return [\"system_prompt\", \"generation_models\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `chosen`,\n `chosen_model`, `chosen_rating`, `rejected`, `rejected_model`, `rejected_rating`. Both\n the `chosen_model` and `rejected_model` being optional and only used if `generation_models`\n is available.\n\n Reference:\n - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n \"\"\"\n return [\n \"prompt\",\n \"prompt_id\",\n \"chosen\",\n \"chosen_model\",\n \"chosen_rating\",\n \"rejected\",\n \"rejected_model\",\n \"rejected_rating\",\n ]\n\n def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the DPO formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n \"\"\"\n for input in inputs:\n for item in input:\n messages = [\n {\"role\": \"user\", \"content\": item[\"instruction\"]}, # type: ignore\n ]\n if (\n \"system_prompt\" in item\n and isinstance(item[\"system_prompt\"], str) # type: ignore\n and len(item[\"system_prompt\"]) > 0 # type: ignore\n ):\n messages.insert(\n 0,\n {\"role\": \"system\", \"content\": item[\"system_prompt\"]}, # type: ignore\n )\n\n item[\"prompt\"] = item[\"instruction\"]\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"chosen\"] = messages + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][chosen_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"rejected\"] = messages + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][rejected_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n item[\"rejected_rating\"] = 
item[\"ratings\"][rejected_idx]\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.inputs","title":"inputs: StepColumns property ","text":"List of inputs required by the Step , which in this case are: instruction , generations , and ratings . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.optional_inputs","title":"optional_inputs: List[str] property ","text":"List of optional inputs, which are not required by the Step but used if available, which in this case are: system_prompt , and generation_models . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.outputs","title":"outputs: StepColumns property ","text":"List of outputs generated by the Step , which are: prompt , prompt_id , chosen , chosen_model , chosen_rating , rejected , rejected_model , rejected_rating . Both the chosen_model and rejected_model being optional and only used if generation_models is available. Reference - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.process","title":"process(*inputs) ","text":"The process method formats the received StepInput or list of StepInput according to the DPO formatting standard. Parameters: Name Type Description Default *inputs StepInput A list of StepInput to be combined. () Yields: Type Description StepOutput A StepOutput with batches of formatted StepInput following the DPO standard. Source code in src/distilabel/steps/formatting/dpo.py def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the DPO formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n \"\"\"\n for input in inputs:\n for item in input:\n messages = [\n {\"role\": \"user\", \"content\": item[\"instruction\"]}, # type: ignore\n ]\n if (\n \"system_prompt\" in item\n and isinstance(item[\"system_prompt\"], str) # type: ignore\n and len(item[\"system_prompt\"]) > 0 # type: ignore\n ):\n messages.insert(\n 0,\n {\"role\": \"system\", \"content\": item[\"system_prompt\"]}, # type: ignore\n )\n\n item[\"prompt\"] = item[\"instruction\"]\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"chosen\"] = messages + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][chosen_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"rejected\"] = messages + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][rejected_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT","title":"FormatChatGenerationSFT ","text":" Bases: Step Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT). FormatChatGenerationSFT is a Step that formats the output of a ChatGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook . The output of the ChatGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation. Input columns - system_prompt (
str , optional): The system prompt used within the LLM to generate the generation , if available. - instruction (
str ): The instruction used to generate the generation with the LLM . - generation (
str ): The generation produced by the LLM . Output columns - prompt (
str ): The instruction used to generate the generation with the LLM . - prompt_id (
str ): The SHA256 hash of the prompt . - messages (
List[Dict[str, str]] ): The chat-like conversation with the instruction as the user message and the generation as the assistant message. Categories - format
- chat-generation
- instruction
- generation
Examples: Format your dataset for SFT: from distilabel.steps import FormatChatGenerationSFT\n\nformat_sft = FormatChatGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n format_sft.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n \"generation\": \"4\"\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'generation': '4',\n# 'prompt': 'What's 2+2?',\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# }\n# ]\n Source code in src/distilabel/steps/formatting/sft.py class FormatChatGenerationSFT(Step):\n \"\"\"Format the output of a `ChatGeneration` task for Supervised Fine-Tuning (SFT).\n\n `FormatChatGenerationSFT` is a `Step` that formats the output of a `ChatGeneration` task for\n Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as `axolotl`\n or `alignment-handbook`. The output of the `ChatGeneration` task is formatted into a chat-like\n conversation with the `instruction` as the user message and the `generation` as the assistant\n message. Optionally, if the `system_prompt` is available, it is included as the first message\n in the conversation.\n\n Input columns:\n - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n `generation`, if available.\n - instruction (`str`): The instruction used to generate the `generation` with the `LLM`.\n - generation (`str`): The generation produced by the `LLM`.\n\n Output columns:\n - prompt (`str`): The instruction used to generate the `generation` with the `LLM`.\n - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n - messages (`List[Dict[str, str]]`): The chat-like conversation with the `instruction` as\n the user message and the `generation` as the assistant message.\n\n Categories:\n - format\n - chat-generation\n - instruction\n - generation\n\n Examples:\n Format your dataset for SFT:\n\n ```python\n from distilabel.steps import FormatChatGenerationSFT\n\n format_sft = FormatChatGenerationSFT()\n format_sft.load()\n\n # NOTE: \"system_prompt\" can be added optionally.\n result = next(\n format_sft.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n \"generation\": \"4\"\n }\n ]\n )\n )\n # >>> result\n # [\n # {\n # 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n # 'generation': '4',\n # 'prompt': 'What's 2+2?',\n # 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n # }\n # ]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"List of inputs required by the `Step`, which in this case are: `instruction`, and `generation`.\"\"\"\n return [\"messages\", \"generation\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `messages`.\n\n Reference:\n - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n \"\"\"\n return [\"prompt\", \"prompt_id\", \"messages\"]\n\n def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the SFT formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the 
SFT standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = next(\n (\n turn[\"content\"]\n for turn in item[\"messages\"]\n if turn[\"role\"] == \"user\"\n ),\n None,\n )\n\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n item[\"messages\"] = item[\"messages\"] + [\n {\"role\": \"assistant\", \"content\": item[\"generation\"]}, # type: ignore\n ]\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.inputs","title":"inputs: StepColumns property ","text":"List of inputs required by the Step , which in this case are: instruction , and generation . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.outputs","title":"outputs: StepColumns property ","text":"List of outputs generated by the Step , which are: prompt , prompt_id , messages . Reference - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.process","title":"process(*inputs) ","text":"The process method formats the received StepInput or list of StepInput according to the SFT formatting standard. Parameters: Name Type Description Default *inputs StepInput A list of StepInput to be combined. () Yields: Type Description StepOutput A StepOutput with batches of formatted StepInput following the SFT standard. Source code in src/distilabel/steps/formatting/sft.py def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the SFT formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = next(\n (\n turn[\"content\"]\n for turn in item[\"messages\"]\n if turn[\"role\"] == \"user\"\n ),\n None,\n )\n\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n item[\"messages\"] = item[\"messages\"] + [\n {\"role\": \"assistant\", \"content\": item[\"generation\"]}, # type: ignore\n ]\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT","title":"FormatTextGenerationSFT ","text":" Bases: Step Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT). FormatTextGenerationSFT is a Step that formats the output of a TextGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook . The output of the TextGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation. Input columns - system_prompt (
str , optional): The system prompt used within the LLM to generate the generation , if available. - instruction (
str ): The instruction used to generate the generation with the LLM . - generation (
str ): The generation produced by the LLM . Output columns - prompt (
str ): The instruction used to generate the generation with the LLM . - prompt_id (
str ): The SHA256 hash of the prompt . - messages (
List[Dict[str, str]] ): The chat-like conversation with the instruction as the user message and the generation as the assistant message. Categories - format
- text-generation
- instruction
- generation
Examples: Format your dataset for SFT fine tuning: from distilabel.steps import FormatTextGenerationSFT\n\nformat_sft = FormatTextGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n format_sft.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generation\": \"4\"\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'instruction': 'What's 2+2?',\n# 'generation': '4',\n# 'prompt': 'What's 2+2?',\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n# }\n# ]\n Source code in src/distilabel/steps/formatting/sft.py class FormatTextGenerationSFT(Step):\n \"\"\"Format the output of a `TextGeneration` task for Supervised Fine-Tuning (SFT).\n\n `FormatTextGenerationSFT` is a `Step` that formats the output of a `TextGeneration` task for\n Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as `axolotl`\n or `alignment-handbook`. The output of the `TextGeneration` task is formatted into a chat-like\n conversation with the `instruction` as the user message and the `generation` as the assistant\n message. Optionally, if the `system_prompt` is available, it is included as the first message\n in the conversation.\n\n Input columns:\n - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n `generation`, if available.\n - instruction (`str`): The instruction used to generate the `generation` with the `LLM`.\n - generation (`str`): The generation produced by the `LLM`.\n\n Output columns:\n - prompt (`str`): The instruction used to generate the `generation` with the `LLM`.\n - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n - messages (`List[Dict[str, str]]`): The chat-like conversation with the `instruction` as\n the user message and the `generation` as the assistant message.\n\n Categories:\n - format\n - text-generation\n - instruction\n - generation\n\n Examples:\n Format your dataset for SFT fine tuning:\n\n ```python\n from distilabel.steps import FormatTextGenerationSFT\n\n format_sft = FormatTextGenerationSFT()\n format_sft.load()\n\n # NOTE: \"system_prompt\" can be added optionally.\n result = next(\n format_sft.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generation\": \"4\"\n }\n ]\n )\n )\n # >>> result\n # [\n # {\n # 'instruction': 'What's 2+2?',\n # 'generation': '4',\n # 'prompt': 'What's 2+2?',\n # 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n # 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n # }\n # ]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"List of inputs required by the `Step`, which in this case are: `instruction`, and `generation`.\"\"\"\n return {\n \"system_prompt\": False,\n \"instruction\": True,\n \"generation\": True,\n }\n\n @property\n def optional_inputs(self) -> List[str]:\n \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n which in this case is: `system_prompt`.\"\"\"\n return [\"system_prompt\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `messages`.\n\n Reference:\n - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n \"\"\"\n return [\"prompt\", \"prompt_id\", \"messages\"]\n\n def process(self, *inputs: StepInput) -> 
\"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the SFT formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = item[\"instruction\"]\n\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n item[\"messages\"] = [\n {\"role\": \"user\", \"content\": item[\"instruction\"]}, # type: ignore\n {\"role\": \"assistant\", \"content\": item[\"generation\"]}, # type: ignore\n ]\n if (\n \"system_prompt\" in item\n and isinstance(item[\"system_prompt\"], str) # type: ignore\n and len(item[\"system_prompt\"]) > 0 # type: ignore\n ):\n item[\"messages\"].insert(\n 0,\n {\"role\": \"system\", \"content\": item[\"system_prompt\"]}, # type: ignore\n )\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.inputs","title":"inputs: StepColumns property ","text":"List of inputs required by the Step , which in this case are: instruction , and generation . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.optional_inputs","title":"optional_inputs: List[str] property ","text":"List of optional inputs, which are not required by the Step but used if available, which in this case is: system_prompt . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.outputs","title":"outputs: StepColumns property ","text":"List of outputs generated by the Step , which are: prompt , prompt_id , messages . Reference - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.process","title":"process(*inputs) ","text":"The process method formats the received StepInput or list of StepInput according to the SFT formatting standard. Parameters: Name Type Description Default *inputs StepInput A list of StepInput to be combined. () Yields: Type Description StepOutput A StepOutput with batches of formatted StepInput following the SFT standard. Source code in src/distilabel/steps/formatting/sft.py def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the SFT formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = item[\"instruction\"]\n\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n item[\"messages\"] = [\n {\"role\": \"user\", \"content\": item[\"instruction\"]}, # type: ignore\n {\"role\": \"assistant\", \"content\": item[\"generation\"]}, # type: ignore\n ]\n if (\n \"system_prompt\" in item\n and isinstance(item[\"system_prompt\"], str) # type: ignore\n and len(item[\"system_prompt\"]) > 0 # type: ignore\n ):\n item[\"messages\"].insert(\n 0,\n {\"role\": \"system\", \"content\": item[\"system_prompt\"]}, # type: ignore\n )\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts","title":"LoadDataFromDicts ","text":" Bases: GeneratorStep Loads a dataset from a list of dictionaries. GeneratorStep that loads a dataset from a list of dictionaries and yields it in batches. Attributes: Name Type Description data List[Dict[str, Any]] The list of dictionaries to load the data from. Runtime parameters batch_size : The batch size to use when processing the data. Output columns - dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
Categories Examples: Load data from a list of dictionaries: from distilabel.steps import LoadDataFromDicts\n\nloader = LoadDataFromDicts(\n data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n batch_size=2\n)\nloader.load()\n\nresult = next(loader.process())\n# >>> result\n# ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n Source code in src/distilabel/steps/generators/data.py class LoadDataFromDicts(GeneratorStep):\n \"\"\"Loads a dataset from a list of dictionaries.\n\n `GeneratorStep` that loads a dataset from a list of dictionaries and yields it in\n batches.\n\n Attributes:\n data: The list of dictionaries to load the data from.\n\n Runtime parameters:\n - `batch_size`: The batch size to use when processing the data.\n\n Output columns:\n - dynamic (based on the keys found on the first dictionary of the list): The columns\n of the dataset.\n\n Categories:\n - load\n\n Examples:\n Load data from a list of dictionaries:\n\n ```python\n from distilabel.steps import LoadDataFromDicts\n\n loader = LoadDataFromDicts(\n data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n batch_size=2\n )\n loader.load()\n\n result = next(loader.process())\n # >>> result\n # ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n ```\n \"\"\"\n\n data: List[Dict[str, Any]] = Field(default_factory=list, exclude=True)\n\n @override\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\": # type: ignore\n \"\"\"Yields batches from a list of dictionaries.\n\n Args:\n offset: The offset to start the generation from. Defaults to `0`.\n\n Yields:\n A list of Python dictionaries as read from the inputs (propagated in batches)\n and a flag indicating whether the yield batch is the last one.\n \"\"\"\n if offset:\n self.data = self.data[offset:]\n\n while self.data:\n batch = self.data[: self.batch_size]\n self.data = self.data[self.batch_size :]\n yield (\n batch,\n True if len(self.data) == 0 else False,\n )\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"Returns a list of strings with the names of the columns that the step will generate.\"\"\"\n return list(self.data[0].keys())\n "},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts.outputs","title":"outputs: List[str] property ","text":"Returns a list of strings with the names of the columns that the step will generate. "},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts.process","title":"process(offset=0) ","text":"Yields batches from a list of dictionaries. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0 . 0 Yields: Type Description GeneratorStepOutput A list of Python dictionaries as read from the inputs (propagated in batches) GeneratorStepOutput and a flag indicating whether the yield batch is the last one. Source code in src/distilabel/steps/generators/data.py @override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\": # type: ignore\n \"\"\"Yields batches from a list of dictionaries.\n\n Args:\n offset: The offset to start the generation from. 
Defaults to `0`.\n\n Yields:\n A list of Python dictionaries as read from the inputs (propagated in batches)\n and a flag indicating whether the yield batch is the last one.\n \"\"\"\n if offset:\n self.data = self.data[offset:]\n\n while self.data:\n batch = self.data[: self.batch_size]\n self.data = self.data[self.batch_size :]\n yield (\n batch,\n True if len(self.data) == 0 else False,\n )\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DataSampler","title":"DataSampler ","text":" Bases: GeneratorStep Step to sample from a dataset. GeneratorStep that samples from a dataset and yields it in batches. This step is useful when you have a pipeline that can benefit from using examples in the prompts for example as few-shot learning, that can be changing on each row. For example, you can pass a list of dictionaries with N examples and generate M samples from it (assuming you have another step loading data, this M should have the same size as the data being loaded in that step). The size S argument is the number of samples per row generated, so each example would contain S examples to be used as examples. Attributes: Name Type Description data List[Dict[str, Any]] The list of dictionaries to sample from. size int Number of samples per example. For example in a few-shot learning scenario, the number of few-shot examples that will be generated per example. Defaults to 2. samples int Number of examples that will be generated by the step in total. If used with another loader step, this should be the same as the number of samples in the loader step. Defaults to 100. Output columns - dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
Categories Examples: Sample data from a list of dictionaries: from distilabel.steps import DataSampler\n\nsampler = DataSampler(\n data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n samples=10,\n size=2,\n batch_size=4\n)\nsampler.load()\n\nresult = next(sampler.process())\n# >>> result\n# ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n Pipeline with a loader and a sampler combined in a single stream: from datasets import load_dataset\n\nfrom distilabel.steps import LoadDataFromDicts, DataSampler\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples\nfrom distilabel.pipeline import Pipeline\n\nds = (\n load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n .shuffle(seed=42)\n .select(range(500))\n .to_list()\n)\ndata = [\n {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n },\n {\n \"func_name\": \"permutation_count\",\n \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n },\n {\n \"func_name\": \"getdivision\",\n \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n },\n]\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n loader_seeds = LoadDataFromDicts(data=data)\n sampler = DataSampler(\n data=ds,\n size=2,\n samples=len(data),\n batch_size=8,\n )\n prep_examples = PrepareExamples()\n\n sampler >> prep_examples\n (\n [loader_seeds, prep_examples]\n >> combine_steps\n )\n# Now we have a single stream of data with the loader and the sampler data\n Source code in src/distilabel/steps/generators/data_sampler.py class DataSampler(GeneratorStep):\n \"\"\"Step to sample from a dataset.\n\n `GeneratorStep` that samples from a dataset and yields it in batches.\n This step is useful when you have a pipeline that can benefit from using examples\n in the prompts for example as few-shot learning, that can be changing on each row.\n For example, you can pass a list of dictionaries with N examples and generate M samples\n from it (assuming you have another step loading data, this M should have the same size\n as the data being loaded in that step). The size S argument is the number of samples per\n row generated, so each example would contain S examples to be used as examples.\n\n Attributes:\n data: The list of dictionaries to sample from.\n size: Number of samples per example. For example in a few-shot learning scenario,\n the number of few-shot examples that will be generated per example. Defaults to 2.\n samples: Number of examples that will be generated by the step in total.\n If used with another loader step, this should be the same as the number\n of samples in the loader step. 
Defaults to 100.\n\n Output columns:\n - dynamic (based on the keys found on the first dictionary of the list): The columns\n of the dataset.\n\n Categories:\n - load\n\n Examples:\n Sample data from a list of dictionaries:\n\n ```python\n from distilabel.steps import DataSampler\n\n sampler = DataSampler(\n data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n samples=10,\n size=2,\n batch_size=4\n )\n sampler.load()\n\n result = next(sampler.process())\n # >>> result\n # ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n ```\n\n Pipeline with a loader and a sampler combined in a single stream:\n\n ```python\n from datasets import load_dataset\n\n from distilabel.steps import LoadDataFromDicts, DataSampler\n from distilabel.steps.tasks.apigen.utils import PrepareExamples\n from distilabel.pipeline import Pipeline\n\n ds = (\n load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n .shuffle(seed=42)\n .select(range(500))\n .to_list()\n )\n data = [\n {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n },\n {\n \"func_name\": \"permutation_count\",\n \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n },\n {\n \"func_name\": \"getdivision\",\n \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n },\n ]\n with Pipeline(name=\"APIGenPipeline\") as pipeline:\n loader_seeds = LoadDataFromDicts(data=data)\n sampler = DataSampler(\n data=ds,\n size=2,\n samples=len(data),\n batch_size=8,\n )\n prep_examples = PrepareExamples()\n\n sampler >> prep_examples\n (\n [loader_seeds, prep_examples]\n >> combine_steps\n )\n # Now we have a single stream of data with the loader and the sampler data\n ```\n \"\"\"\n\n data: List[Dict[str, Any]] = Field(default_factory=list, exclude=True)\n size: int = Field(\n default=2,\n description=(\n \"Number of samples per example. For example in a few-shot learning scenario, the number \"\n \"of few-shot examples that will be generated per example.\"\n ),\n )\n samples: int = Field(\n default=100,\n description=(\n \"Number of examples that will be generated by the step in total. \"\n \"If used with another loader step, this should be the same as the number of \"\n \"samples in the loader step.\"\n ),\n )\n\n @override\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\": # type: ignore\n \"\"\"Yields batches from a list of dictionaries.\n\n Args:\n offset: The offset to start the generation from. 
Defaults to `0`.\n\n Yields:\n A list of Python dictionaries as read from the inputs (propagated in batches)\n and a flag indicating whether the yield batch is the last one.\n \"\"\"\n\n total_samples = 0\n\n while total_samples < self.samples:\n batch = []\n bs = min(self.batch_size, self.samples - total_samples)\n for _ in range(self.batch_size):\n choices = random.choices(self.data, k=self.size)\n choices = self._transform_data(choices)\n batch.extend(choices)\n total_samples += bs\n batch = list(islice(batch, bs))\n yield (batch, True if total_samples >= self.samples else False)\n batch = []\n\n @staticmethod\n def _transform_data(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n if not data:\n return []\n\n result = {key: [] for key in data[0].keys()}\n\n for item in data:\n for key, value in item.items():\n result[key].append(value)\n\n return [result]\n\n @property\n def outputs(self) -> List[str]:\n return list(self.data[0].keys())\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DataSampler.process","title":"process(offset=0) ","text":"Yields batches from a list of dictionaries. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0 . 0 Yields: Type Description GeneratorStepOutput A list of Python dictionaries as read from the inputs (propagated in batches) GeneratorStepOutput and a flag indicating whether the yield batch is the last one. Source code in src/distilabel/steps/generators/data_sampler.py @override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\": # type: ignore\n \"\"\"Yields batches from a list of dictionaries.\n\n Args:\n offset: The offset to start the generation from. Defaults to `0`.\n\n Yields:\n A list of Python dictionaries as read from the inputs (propagated in batches)\n and a flag indicating whether the yield batch is the last one.\n \"\"\"\n\n total_samples = 0\n\n while total_samples < self.samples:\n batch = []\n bs = min(self.batch_size, self.samples - total_samples)\n for _ in range(self.batch_size):\n choices = random.choices(self.data, k=self.size)\n choices = self._transform_data(choices)\n batch.extend(choices)\n total_samples += bs\n batch = list(islice(batch, bs))\n yield (batch, True if total_samples >= self.samples else False)\n batch = []\n "},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore","title":"RewardModelScore ","text":" Bases: Step , CudaDevicePlacementMixin Assign a score to a response using a Reward Model. RewardModelScore is a Step that using a Reward Model (RM) loaded using transformers , assigns an score to a response generated for an instruction, or a score to a multi-turn conversation. Attributes: Name Type Description model str the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. revision str if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . torch_dtype str the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\" . trust_remote_code bool whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . device_map Union[str, Dict[str, Any], None] a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\" . Defaults to None . token Union[SecretStr, None] the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. 
If not provided, the HF_TOKEN environment variable or the huggingface_hub package local configuration will be used. Defaults to None . truncation bool whether to truncate sequences at the maximum length. Defaults to False . max_length Union[int, None] maximum length to use for padding or truncation. Defaults to None . Input columns - instruction (
str , optional): the instruction used to generate a response . If provided, then response must be provided too. - response (
str , optional): the response generated for instruction . If provided, then instruction must be provided too. - conversation (
ChatType , optional): a multi-turn conversation. If not provided, then instruction and response columns must be provided. Output columns - score (
float ): the score given by the reward model for the instruction-response pair or the conversation. Categories Examples: Assigning an score for an instruction-response pair: from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n step.process(\n inputs=[\n {\n \"instruction\": \"How much is 2+2?\",\n \"response\": \"The output of 2+2 is 4\",\n },\n {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n ]\n )\n)\n# [\n# {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n# {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n# ]\n Assigning an score for a multi-turn conversation: from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n step.process(\n inputs=[\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n ],\n },\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"4\"},\n ],\n },\n ]\n )\n)\n# [\n# {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n# {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n# ]\n Source code in src/distilabel/steps/reward_model.py class RewardModelScore(Step, CudaDevicePlacementMixin):\n \"\"\"Assign a score to a response using a Reward Model.\n\n `RewardModelScore` is a `Step` that using a Reward Model (RM) loaded using `transformers`,\n assigns an score to a response generated for an instruction, or a score to a multi-turn\n conversation.\n\n Attributes:\n model: the model Hugging Face Hub repo id or a path to a directory containing the\n model weights and configuration files.\n revision: if `model` refers to a Hugging Face Hub repository, then the revision\n (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc.\n Defaults to `\"auto\"`.\n trust_remote_code: whether to allow fetching and executing remote code fetched\n from the repository in the Hub. Defaults to `False`.\n device_map: a dictionary mapping each layer of the model to a device, or a mode like `\"sequential\"` or `\"auto\"`. Defaults to `None`.\n token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n local configuration will be used. Defaults to `None`.\n truncation: whether to truncate sequences at the maximum length. Defaults to `False`.\n max_length: maximun length to use for padding or truncation. Defaults to `None`.\n\n Input columns:\n - instruction (`str`, optional): the instruction used to generate a `response`.\n If provided, then `response` must be provided too.\n - response (`str`, optional): the response generated for `instruction`. If provided,\n then `instruction` must be provide too.\n - conversation (`ChatType`, optional): a multi-turn conversation. 
If not provided,\n then `instruction` and `response` columns must be provided.\n\n Output columns:\n - score (`float`): the score given by the reward model for the instruction-response\n pair or the conversation.\n\n Categories:\n - scorer\n\n Examples:\n Assigning an score for an instruction-response pair:\n\n ```python\n from distilabel.steps import RewardModelScore\n\n step = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n )\n\n step.load()\n\n result = next(\n step.process(\n inputs=[\n {\n \"instruction\": \"How much is 2+2?\",\n \"response\": \"The output of 2+2 is 4\",\n },\n {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n ]\n )\n )\n # [\n # {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n # {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n # ]\n ```\n\n Assigning an score for a multi-turn conversation:\n\n ```python\n from distilabel.steps import RewardModelScore\n\n step = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n )\n\n step.load()\n\n result = next(\n step.process(\n inputs=[\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n ],\n },\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"4\"},\n ],\n },\n ]\n )\n )\n # [\n # {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n # {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n # ]\n ```\n \"\"\"\n\n model: str\n revision: str = \"main\"\n torch_dtype: str = \"auto\"\n trust_remote_code: bool = False\n device_map: Union[str, Dict[str, Any], None] = None\n token: Union[SecretStr, None] = Field(\n default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR), description=\"\"\n )\n truncation: bool = False\n max_length: Union[int, None] = None\n\n _model: Union[\"PreTrainedModel\", None] = PrivateAttr(None)\n _tokenizer: Union[\"PreTrainedTokenizer\", None] = PrivateAttr(None)\n\n def load(self) -> None:\n super().load()\n\n if self.device_map in [\"cuda\", \"auto\"]:\n CudaDevicePlacementMixin.load(self)\n\n try:\n from transformers import AutoModelForSequenceClassification, AutoTokenizer\n except ImportError as e:\n raise ImportError(\n \"`transformers` is not installed. 
Please install it using `pip install transformers`.\"\n ) from e\n\n token = self.token.get_secret_value() if self.token is not None else self.token\n\n self._model = AutoModelForSequenceClassification.from_pretrained(\n self.model,\n revision=self.revision,\n torch_dtype=self.torch_dtype,\n trust_remote_code=self.trust_remote_code,\n device_map=self.device_map,\n token=token,\n )\n self._tokenizer = AutoTokenizer.from_pretrained(\n self.model,\n revision=self.revision,\n torch_dtype=self.torch_dtype,\n trust_remote_code=self.trust_remote_code,\n token=token,\n )\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"Either `response` and `instruction`, or a `conversation` columns.\"\"\"\n return {\n \"response\": False,\n \"instruction\": False,\n \"conversation\": False,\n }\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The `score` given by the reward model.\"\"\"\n return [\"score\"]\n\n def _prepare_conversation(self, input: Dict[str, Any]) -> \"ChatType\":\n if \"instruction\" in input and \"response\" in input:\n return [\n {\"role\": \"user\", \"content\": input[\"instruction\"]},\n {\"role\": \"assistant\", \"content\": input[\"response\"]},\n ]\n\n return input[\"conversation\"]\n\n def _prepare_inputs(self, inputs: List[Dict[str, Any]]) -> \"torch.Tensor\":\n return self._tokenizer.apply_chat_template( # type: ignore\n [self._prepare_conversation(input) for input in inputs], # type: ignore\n return_tensors=\"pt\",\n padding=True,\n truncation=self.truncation,\n max_length=self.max_length,\n ).to(self._model.device) # type: ignore\n\n def _inference(self, inputs: List[Dict[str, Any]]) -> List[float]:\n import torch\n\n input_ids = self._prepare_inputs(inputs)\n with torch.no_grad():\n output = self._model(input_ids) # type: ignore\n logits = output.logits\n if logits.shape == (2, 1):\n logits = logits.squeeze(-1)\n return logits.tolist()\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n scores = self._inference(inputs)\n for input, score in zip(inputs, scores):\n input[\"score\"] = score\n yield inputs\n\n def unload(self) -> None:\n if self.device_map in [\"cuda\", \"auto\"]:\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n "},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore.inputs","title":"inputs: StepColumns property ","text":"Either response and instruction , or a conversation columns. "},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore.outputs","title":"outputs: StepColumns property ","text":"The score given by the reward model. "},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn","title":"TruncateTextColumn ","text":" Bases: Step Truncate a row using a tokenizer or the number of characters. TruncateTextColumn is a Step that truncates a row according to the max length. If the tokenizer is provided, then the row will be truncated using the tokenizer, and the max_length will be used as the maximum number of tokens, otherwise it will be used as the maximum number of characters. The TruncateTextColumn step is useful when one wants to truncate a row to a certain length, to avoid posterior errors in the model due to the length. Attributes: Name Type Description column str the column to truncate. Defaults to \"text\" . max_length int the maximum length to use for truncation. If a tokenizer is given, corresponds to the number of tokens, otherwise corresponds to the number of characters. Defaults to 8192 . 
tokenizer Optional[str] the name of the tokenizer to use. If provided, the row will be truncated using the tokenizer. Defaults to None . Input columns - dynamic (determined by
column attribute): The columns to be truncated, defaults to \"text\". Output columns - dynamic (determined by
column attribute): The truncated column. Categories Examples: Truncating a row to a given number of tokens: from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n max_length=4,\n column=\"text\"\n)\n\ntrunc.load()\n\nresult = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n)\n# result\n# [{'text': 'This is a sample'}]\n Truncating a row to a given number of characters: from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(max_length=10)\n\ntrunc.load()\n\nresult = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n)\n# result\n# [{'text': 'This is a '}]\n Source code in src/distilabel/steps/truncate.py class TruncateTextColumn(Step):\n \"\"\"Truncate a row using a tokenizer or the number of characters.\n\n `TruncateTextColumn` is a `Step` that truncates a row according to the max length. If\n the `tokenizer` is provided, then the row will be truncated using the tokenizer,\n and the `max_length` will be used as the maximum number of tokens, otherwise it will\n be used as the maximum number of characters. The `TruncateTextColumn` step is useful when one\n wants to truncate a row to a certain length, to avoid posterior errors in the model due\n to the length.\n\n Attributes:\n column: the column to truncate. Defaults to `\"text\"`.\n max_length: the maximum length to use for truncation.\n If a `tokenizer` is given, corresponds to the number of tokens,\n otherwise corresponds to the number of characters. Defaults to `8192`.\n tokenizer: the name of the tokenizer to use. If provided, the row will be\n truncated using the tokenizer. Defaults to `None`.\n\n Input columns:\n - dynamic (determined by `column` attribute): The columns to be truncated, defaults to \"text\".\n\n Output columns:\n - dynamic (determined by `column` attribute): The truncated column.\n\n Categories:\n - text-manipulation\n\n Examples:\n Truncating a row to a given number of tokens:\n\n ```python\n from distilabel.steps import TruncateTextColumn\n\n trunc = TruncateTextColumn(\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n max_length=4,\n column=\"text\"\n )\n\n trunc.load()\n\n result = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n )\n # result\n # [{'text': 'This is a sample'}]\n ```\n\n Truncating a row to a given number of characters:\n\n ```python\n from distilabel.steps import TruncateTextColumn\n\n trunc = TruncateTextColumn(max_length=10)\n\n trunc.load()\n\n result = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n )\n # result\n # [{'text': 'This is a '}]\n ```\n \"\"\"\n\n column: str = \"text\"\n max_length: int = 8192\n tokenizer: Optional[str] = None\n _truncator: Optional[Callable[[str], str]] = None\n _tokenizer: Optional[Any] = None\n\n def load(self):\n super().load()\n if self.tokenizer:\n if not importlib.util.find_spec(\"transformers\"):\n raise ImportError(\n \"`transformers` is needed to tokenize, but is not installed. 
\"\n \"Please install it using `pip install transformers`.\"\n )\n\n from transformers import AutoTokenizer\n\n self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)\n self._truncator = self._truncate_with_tokenizer\n else:\n self._truncator = self._truncate_with_length\n\n @property\n def inputs(self) -> List[str]:\n return [self.column]\n\n @property\n def outputs(self) -> List[str]:\n return self.inputs\n\n def _truncate_with_length(self, text: str) -> str:\n \"\"\"Truncates the text according to the number of characters.\"\"\"\n return text[: self.max_length]\n\n def _truncate_with_tokenizer(self, text: str) -> str:\n \"\"\"Truncates the text according to the number of characters using the tokenizer.\"\"\"\n return self._tokenizer.decode(\n self._tokenizer.encode(\n text,\n add_special_tokens=False,\n max_length=self.max_length,\n truncation=True,\n )\n )\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n input[self.column] = self._truncator(input[self.column])\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn._truncate_with_length","title":"_truncate_with_length(text) ","text":"Truncates the text according to the number of characters. Source code in src/distilabel/steps/truncate.py def _truncate_with_length(self, text: str) -> str:\n \"\"\"Truncates the text according to the number of characters.\"\"\"\n return text[: self.max_length]\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn._truncate_with_tokenizer","title":"_truncate_with_tokenizer(text) ","text":"Truncates the text according to the number of characters using the tokenizer. Source code in src/distilabel/steps/truncate.py def _truncate_with_tokenizer(self, text: str) -> str:\n \"\"\"Truncates the text according to the number of characters using the tokenizer.\"\"\"\n return self._tokenizer.decode(\n self._tokenizer.encode(\n text,\n add_special_tokens=False,\n max_length=self.max_length,\n truncation=True,\n )\n )\n "},{"location":"api/step_gallery/hugging_face/","title":"Hugging Face","text":"This section contains the existing steps integrated with Hugging Face so as to easily push the generated datasets to Hugging Face. "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk","title":"LoadDataFromDisk ","text":" Bases: LoadDataFromHub Load a dataset that was previously saved to disk. If you previously saved your dataset using the save_to_disk method, or Distiset.save_to_disk you can load it again to build a new pipeline using this class. Attributes: Name Type Description dataset_path RuntimeParameter[Union[str, Path]] The path to the dataset or distiset. split Optional[RuntimeParameter[str]] The split of the dataset to load (typically will be train , test or validation ). config Optional[RuntimeParameter[str]] The configuration of the dataset to load. Defaults to default , if there are multiple configurations in the dataset this must be suplied or an error is raised. Runtime parameters batch_size : The batch size to use when processing the data. dataset_path : The path to the dataset or distiset. is_distiset : Whether the dataset to load is a Distiset or not. Defaults to False. split : The split of the dataset to load. Defaults to 'train'. config : The configuration of the dataset to load. Defaults to default , if there are multiple configurations in the dataset this must be suplied or an error is raised. num_examples : The number of examples to load from the dataset. 
By default, it will load all examples. storage_options : Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . Output columns - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. Categories Examples: Load data from a Hugging Face Dataset: from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Load data from a distilabel Distiset: from distilabel.steps import LoadDataFromDisk\n\n# Specify the configuration to load.\nloader = LoadDataFromDisk(\n dataset_path=\"path/to/dataset\",\n is_distiset=True,\n config=\"leaf_step_1\"\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n Load data from a Hugging Face Dataset or Distiset in your cloud provider: from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(\n dataset_path=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Source code in src/distilabel/steps/generators/huggingface.py class LoadDataFromDisk(LoadDataFromHub):\n \"\"\"Load a dataset that was previously saved to disk.\n\n If you previously saved your dataset using the `save_to_disk` method, or\n `Distiset.save_to_disk` you can load it again to build a new pipeline using this class.\n\n Attributes:\n dataset_path: The path to the dataset or distiset.\n split: The split of the dataset to load (typically will be `train`, `test` or `validation`).\n config: The configuration of the dataset to load. Defaults to `default`, if there are\n multiple configurations in the dataset this must be suplied or an error is raised.\n\n Runtime parameters:\n - `batch_size`: The batch size to use when processing the data.\n - `dataset_path`: The path to the dataset or distiset.\n - `is_distiset`: Whether the dataset to load is a `Distiset` or not. Defaults to False.\n - `split`: The split of the dataset to load. Defaults to 'train'.\n - `config`: The configuration of the dataset to load. 
Defaults to `default`, if there are\n multiple configurations in the dataset this must be suplied or an error is raised.\n - `num_examples`: The number of examples to load from the dataset.\n By default will load all examples.\n - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n\n Output columns:\n - dynamic (`all`): The columns that will be generated by this step, based on the\n datasets loaded from the Hugging Face Hub.\n\n Categories:\n - load\n\n Examples:\n Load data from a Hugging Face Dataset:\n\n ```python\n from distilabel.steps import LoadDataFromDisk\n\n loader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n\n Load data from a distilabel Distiset:\n\n ```python\n from distilabel.steps import LoadDataFromDisk\n\n # Specify the configuration to load.\n loader = LoadDataFromDisk(\n dataset_path=\"path/to/dataset\",\n is_distiset=True,\n config=\"leaf_step_1\"\n )\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n ```\n\n Load data from a Hugging Face Dataset or Distiset in your cloud provider:\n\n ```python\n from distilabel.steps import LoadDataFromDisk\n\n loader = LoadDataFromDisk(\n dataset_path=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n )\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n \"\"\"\n\n dataset_path: RuntimeParameter[Union[str, Path]] = Field(\n default=None,\n description=\"Path to the dataset or distiset.\",\n )\n config: Optional[RuntimeParameter[str]] = Field(\n default=\"default\",\n description=(\n \"The configuration of the dataset to load. Will default to 'default'\",\n \" which corresponds to a distiset with a single configuration.\",\n ),\n )\n is_distiset: Optional[RuntimeParameter[bool]] = Field(\n default=False,\n description=\"Whether the dataset to load is a `Distiset` or not. Defaults to False.\",\n )\n keep_in_memory: Optional[RuntimeParameter[bool]] = Field(\n default=None,\n description=\"Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk` \"\n \" for more information. Defaults to `None`.\",\n )\n split: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The split of the dataset to load. By default will load the whole Dataset/Distiset.\",\n )\n repo_id: ExcludedField[Union[str, None]] = None\n\n def load(self) -> None:\n \"\"\"Load the dataset from the file/s in disk.\"\"\"\n super(GeneratorStep, self).load()\n if self.is_distiset:\n ds = Distiset.load_from_disk(\n self.dataset_path,\n keep_in_memory=self.keep_in_memory,\n storage_options=self.storage_options,\n )\n if self.config not in ds.keys():\n raise DistilabelUserError(\n f\"Configuration '{self.config}' not found in the Distiset, available ones\"\n f\" are: {list(ds.keys())}. 
Please try changing the `config` parameter to one \"\n \"of the available configurations.\\n\\n\",\n page=\"sections/how_to_guides/advanced/distiset/#using-the-distiset-dataset-object\",\n )\n ds = ds[self.config]\n\n else:\n ds = load_from_disk(\n self.dataset_path,\n keep_in_memory=self.keep_in_memory,\n storage_options=self.storage_options,\n )\n\n if self.split:\n ds = ds[self.split]\n\n self._dataset = ds\n\n if self.num_examples:\n self._dataset = self._dataset.select(range(self.num_examples))\n else:\n self.num_examples = len(self._dataset)\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The columns that will be generated by this step, based on the datasets from a file\n in disk.\n\n Returns:\n The columns that will be generated by this step.\n \"\"\"\n # We assume there are Dataset/IterableDataset, not it's ...Dict counterparts\n if self._dataset is None:\n self.load()\n\n return self._dataset.column_names\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk.outputs","title":"outputs: List[str] property ","text":"The columns that will be generated by this step, based on the datasets from a file in disk. Returns: Type Description List[str] The columns that will be generated by this step. "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk.load","title":"load() ","text":"Load the dataset from the file/s in disk. Source code in src/distilabel/steps/generators/huggingface.py def load(self) -> None:\n \"\"\"Load the dataset from the file/s in disk.\"\"\"\n super(GeneratorStep, self).load()\n if self.is_distiset:\n ds = Distiset.load_from_disk(\n self.dataset_path,\n keep_in_memory=self.keep_in_memory,\n storage_options=self.storage_options,\n )\n if self.config not in ds.keys():\n raise DistilabelUserError(\n f\"Configuration '{self.config}' not found in the Distiset, available ones\"\n f\" are: {list(ds.keys())}. Please try changing the `config` parameter to one \"\n \"of the available configurations.\\n\\n\",\n page=\"sections/how_to_guides/advanced/distiset/#using-the-distiset-dataset-object\",\n )\n ds = ds[self.config]\n\n else:\n ds = load_from_disk(\n self.dataset_path,\n keep_in_memory=self.keep_in_memory,\n storage_options=self.storage_options,\n )\n\n if self.split:\n ds = ds[self.split]\n\n self._dataset = ds\n\n if self.num_examples:\n self._dataset = self._dataset.select(range(self.num_examples))\n else:\n self.num_examples = len(self._dataset)\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem","title":"LoadDataFromFileSystem ","text":" Bases: LoadDataFromHub Loads a dataset from a file in your filesystem. GeneratorStep that creates a dataset from a file in the filesystem, uses Hugging Face datasets library. Take a look at Hugging Face Datasets for more information of the supported file types. Attributes: Name Type Description data_files RuntimeParameter[Union[str, Path]] The path to the file, or directory containing the files that conform the dataset. split RuntimeParameter[Union[str, Path]] The split of the dataset to load (typically will be train , test or validation ). Runtime parameters batch_size : The batch size to use when processing the data. data_files : The path to the file, or directory containing the files that conform the dataset. split : The split of the dataset to load. Defaults to 'train'. streaming : Whether to load the dataset in streaming mode or not. Defaults to False . num_examples : The number of examples to load from the dataset. 
By default, it will load all examples. storage_options : Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . filetype : The expected filetype. If not provided, it will be inferred from the file extension. For more than one file, it will be inferred from the first file. Output columns - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. Categories Examples: Load data from a Hugging Face dataset in your file system: from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Specify a filetype if the file extension is not expected: from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Load data from a file in your cloud provider: from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n data_files=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Load data passing a glob pattern: from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n data_files=\"path/to/dataset/*.jsonl\",\n streaming=True\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Source code in src/distilabel/steps/generators/huggingface.py class LoadDataFromFileSystem(LoadDataFromHub):\n \"\"\"Loads a dataset from a file in your filesystem.\n\n `GeneratorStep` that creates a dataset from a file in the filesystem, uses Hugging Face `datasets`\n library. Take a look at [Hugging Face Datasets](https://huggingface.co/docs/datasets/loading)\n for more information of the supported file types.\n\n Attributes:\n data_files: The path to the file, or directory containing the files that conform\n the dataset.\n split: The split of the dataset to load (typically will be `train`, `test` or `validation`).\n\n Runtime parameters:\n - `batch_size`: The batch size to use when processing the data.\n - `data_files`: The path to the file, or directory containing the files that conform\n the dataset.\n - `split`: The split of the dataset to load. Defaults to 'train'.\n - `streaming`: Whether to load the dataset in streaming mode or not. Defaults to\n `False`.\n - `num_examples`: The number of examples to load from the dataset.\n By default will load all examples.\n - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n - `filetype`: The expected filetype. 
If not provided, it will be inferred from the file extension.\n For more than one file, it will be inferred from the first file.\n\n Output columns:\n - dynamic (`all`): The columns that will be generated by this step, based on the\n datasets loaded from the Hugging Face Hub.\n\n Categories:\n - load\n\n Examples:\n Load data from a Hugging Face dataset in your file system:\n\n ```python\n from distilabel.steps import LoadDataFromFileSystem\n\n loader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n\n Specify a filetype if the file extension is not expected:\n\n ```python\n from distilabel.steps import LoadDataFromFileSystem\n\n loader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n\n Load data from a file in your cloud provider:\n\n ```python\n from distilabel.steps import LoadDataFromFileSystem\n\n loader = LoadDataFromFileSystem(\n data_files=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n )\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n\n Load data passing a glob pattern:\n\n ```python\n from distilabel.steps import LoadDataFromFileSystem\n\n loader = LoadDataFromFileSystem(\n data_files=\"path/to/dataset/*.jsonl\",\n streaming=True\n )\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n \"\"\"\n\n data_files: RuntimeParameter[Union[str, Path]] = Field(\n default=None,\n description=\"The data files, or directory containing the data files, to generate the dataset from.\",\n )\n filetype: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The expected filetype. 
If not provided, it will be inferred from the file extension.\",\n )\n repo_id: ExcludedField[Union[str, None]] = None\n\n def load(self) -> None:\n \"\"\"Load the dataset from the file/s in disk.\"\"\"\n GeneratorStep.load(self)\n\n data_path = UPath(self.data_files, storage_options=self.storage_options)\n\n (data_files, self.filetype) = self._prepare_data_files(data_path)\n\n self._dataset = load_dataset(\n self.filetype,\n data_files=data_files,\n split=self.split,\n streaming=self.streaming,\n storage_options=self.storage_options,\n )\n\n if not self.streaming and self.num_examples:\n self._dataset = self._dataset.select(range(self.num_examples))\n if not self.num_examples:\n if self.streaming:\n # There's no better way to get the number of examples in a streaming dataset,\n # load it again for the moment.\n self.num_examples = len(\n load_dataset(\n self.filetype, data_files=self.data_files, split=self.split\n )\n )\n else:\n self.num_examples = len(self._dataset)\n\n @staticmethod\n def _prepare_data_files( # noqa: C901\n data_path: UPath,\n ) -> Tuple[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]], str]:\n \"\"\"Prepare the loading process by setting the `data_files` attribute.\n\n Args:\n data_path: The path to the data files, or directory containing the data files.\n\n Returns:\n Tuple with the data files and the filetype.\n \"\"\"\n\n def get_filetype(data_path: UPath) -> str:\n filetype = data_path.suffix.lstrip(\".\")\n if filetype == \"jsonl\":\n filetype = \"json\"\n return filetype\n\n if data_path.is_file() or (\n len(str(data_path.parent.glob(data_path.name))) >= 1\n ):\n filetype = get_filetype(data_path)\n data_files = str(data_path)\n\n elif data_path.is_dir():\n file_sequence = []\n file_map = defaultdict(list)\n for file_or_folder in data_path.iterdir():\n if file_or_folder.is_file():\n file_sequence.append(str(file_or_folder))\n elif file_or_folder.is_dir():\n for file in file_or_folder.iterdir():\n file_sequence.append(str(file))\n file_map[str(file_or_folder)].append(str(file))\n\n data_files = file_sequence or file_map\n # Try to obtain the filetype from any of the files, assuming all files have the same type.\n if file_sequence:\n filetype = get_filetype(UPath(file_sequence[0]))\n else:\n filetype = get_filetype(UPath(file_map[list(file_map.keys())[0]][0]))\n return data_files, filetype\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The columns that will be generated by this step, based on the datasets from a file\n in disk.\n\n Returns:\n The columns that will be generated by this step.\n \"\"\"\n # We assume there are Dataset/IterableDataset, not it's ...Dict counterparts\n if self._dataset is None:\n self.load()\n\n return self._dataset.column_names\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem.outputs","title":"outputs: List[str] property ","text":"The columns that will be generated by this step, based on the datasets from a file in disk. Returns: Type Description List[str] The columns that will be generated by this step. "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem.load","title":"load() ","text":"Load the dataset from the file/s in disk. 
Source code in src/distilabel/steps/generators/huggingface.py def load(self) -> None:\n \"\"\"Load the dataset from the file/s in disk.\"\"\"\n GeneratorStep.load(self)\n\n data_path = UPath(self.data_files, storage_options=self.storage_options)\n\n (data_files, self.filetype) = self._prepare_data_files(data_path)\n\n self._dataset = load_dataset(\n self.filetype,\n data_files=data_files,\n split=self.split,\n streaming=self.streaming,\n storage_options=self.storage_options,\n )\n\n if not self.streaming and self.num_examples:\n self._dataset = self._dataset.select(range(self.num_examples))\n if not self.num_examples:\n if self.streaming:\n # There's no better way to get the number of examples in a streaming dataset,\n # load it again for the moment.\n self.num_examples = len(\n load_dataset(\n self.filetype, data_files=self.data_files, split=self.split\n )\n )\n else:\n self.num_examples = len(self._dataset)\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub","title":"LoadDataFromHub ","text":" Bases: GeneratorStep Loads a dataset from the Hugging Face Hub. GeneratorStep that loads a dataset from the Hugging Face Hub using the datasets library. Attributes: Name Type Description repo_id RuntimeParameter[str] The Hugging Face Hub repository ID of the dataset to load. split RuntimeParameter[str] The split of the dataset to load. config Optional[RuntimeParameter[str]] The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations. Runtime parameters batch_size : The batch size to use when processing the data. repo_id : The Hugging Face Hub repository ID of the dataset to load. split : The split of the dataset to load. Defaults to 'train'. config : The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations. revision : The revision of the dataset to load. Defaults to the latest revision. streaming : Whether to load the dataset in streaming mode or not. Defaults to False . num_examples : The number of examples to load from the dataset. By default will load all examples. storage_options : Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . Output columns - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. Categories Examples: Load data from a dataset in Hugging Face Hub: from distilabel.steps import LoadDataFromHub\n\nloader = LoadDataFromHub(\n repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n split=\"test\",\n batch_size=2\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'prompt': 'Arianna has 12...', False)\n Source code in src/distilabel/steps/generators/huggingface.py class LoadDataFromHub(GeneratorStep):\n \"\"\"Loads a dataset from the Hugging Face Hub.\n\n `GeneratorStep` that loads a dataset from the Hugging Face Hub using the `datasets`\n library.\n\n Attributes:\n repo_id: The Hugging Face Hub repository ID of the dataset to load.\n split: The split of the dataset to load.\n config: The configuration of the dataset to load. This is optional and only needed\n if the dataset has multiple configurations.\n\n Runtime parameters:\n - `batch_size`: The batch size to use when processing the data.\n - `repo_id`: The Hugging Face Hub repository ID of the dataset to load.\n - `split`: The split of the dataset to load. Defaults to 'train'.\n - `config`: The configuration of the dataset to load. This is optional and only\n needed if the dataset has multiple configurations.\n - `revision`: The revision of the dataset to load. Defaults to the latest revision.\n - `streaming`: Whether to load the dataset in streaming mode or not. Defaults to\n `False`.\n - `num_examples`: The number of examples to load from the dataset.\n By default will load all examples.\n - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n\n Output columns:\n - dynamic (`all`): The columns that will be generated by this step, based on the\n datasets loaded from the Hugging Face Hub.\n\n Categories:\n - load\n\n Examples:\n Load data from a dataset in Hugging Face Hub:\n\n ```python\n from distilabel.steps import LoadDataFromHub\n\n loader = LoadDataFromHub(\n repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n split=\"test\",\n batch_size=2\n )\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'prompt': 'Arianna has 12...', False)\n ```\n \"\"\"\n\n repo_id: RuntimeParameter[str] = Field(\n default=None,\n description=\"The Hugging Face Hub repository ID of the dataset to load.\",\n )\n split: RuntimeParameter[str] = Field(\n default=\"train\",\n description=\"The split of the dataset to load. Defaults to 'train'.\",\n )\n config: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The configuration of the dataset to load. This is optional and only\"\n \" needed if the dataset has multiple configurations.\",\n )\n revision: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The revision of the dataset to load. Defaults to the latest revision.\",\n )\n streaming: RuntimeParameter[bool] = Field(\n default=False,\n description=\"Whether to load the dataset in streaming mode or not. Defaults to False.\",\n )\n num_examples: Optional[RuntimeParameter[int]] = Field(\n default=None,\n description=\"The number of examples to load from the dataset. 
By default will load all examples.\",\n )\n storage_options: Optional[Dict[str, Any]] = Field(\n default=None,\n description=\"The storage options to use when loading the dataset.\",\n )\n\n _dataset: Union[IterableDataset, Dataset, None] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Load the dataset from the Hugging Face Hub\"\"\"\n super().load()\n\n if self._dataset is not None:\n # Here to simplify the functionality of\u00a0distilabel.steps.generators.util.make_generator_step\n return\n\n self._dataset = load_dataset(\n self.repo_id, # type: ignore\n self.config,\n split=self.split,\n revision=self.revision,\n streaming=self.streaming,\n )\n num_examples = self._get_dataset_num_examples()\n self.num_examples = (\n min(self.num_examples, num_examples) if self.num_examples else num_examples\n )\n\n if not self.streaming:\n self._dataset = self._dataset.select(range(self.num_examples))\n\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Yields batches from the loaded dataset from the Hugging Face Hub.\n\n Args:\n offset: The offset to start yielding the data from. Will be used during the caching\n process to help skipping already processed data.\n\n Yields:\n A tuple containing a batch of rows and a boolean indicating if the batch is\n the last one.\n \"\"\"\n num_returned_rows = 0\n for batch_num, batch in enumerate(\n self._dataset.iter(batch_size=self.batch_size) # type: ignore\n ):\n if batch_num * self.batch_size < offset:\n continue\n transformed_batch = self._transform_batch(batch)\n batch_size = len(transformed_batch)\n num_returned_rows += batch_size\n yield transformed_batch, num_returned_rows >= self.num_examples\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The columns that will be generated by this step, based on the datasets loaded\n from the Hugging Face Hub.\n\n Returns:\n The columns that will be generated by this step.\n \"\"\"\n return self._get_dataset_columns()\n\n def _transform_batch(self, batch: Dict[str, Any]) -> List[Dict[str, Any]]:\n \"\"\"Transform a batch of data from the Hugging Face Hub into a list of rows.\n\n Args:\n batch: The batch of data from the Hugging Face Hub.\n\n Returns:\n A list of rows, where each row is a dictionary of column names and values.\n \"\"\"\n length = len(next(iter(batch.values())))\n rows = []\n for i in range(length):\n rows.append({col: values[i] for col, values in batch.items()})\n return rows\n\n def _get_dataset_num_examples(self) -> int:\n \"\"\"Get the number of examples in the dataset, based on the `split` and `config`\n runtime parameters provided.\n\n Returns:\n The number of examples in the dataset.\n \"\"\"\n default_config = self.config\n if not default_config:\n default_config = list(self._dataset_info.keys())[0]\n\n return self._dataset_info[default_config].splits[self.split].num_examples\n\n def _get_dataset_columns(self) -> List[str]:\n \"\"\"Get the columns of the dataset, based on the `config` runtime parameter provided.\n\n Returns:\n The columns of the dataset.\n \"\"\"\n return list(\n self._dataset_info[\n self.config if self.config else \"default\"\n ].features.keys()\n )\n\n @cached_property\n def _dataset_info(self) -> Dict[str, DatasetInfo]:\n \"\"\"Calls the Datasets Server API from Hugging Face to obtain the dataset information.\n\n Returns:\n The dataset information.\n \"\"\"\n\n try:\n return get_dataset_infos(self.repo_id)\n except Exception as e:\n warnings.warn(\n f\"Failed to get dataset info from Hugging Face Hub, trying to get it loading the dataset. 
Error: {e}\",\n UserWarning,\n stacklevel=2,\n )\n ds = load_dataset(self.repo_id, config=self.config, split=self.split)\n if self.config:\n return ds[self.config].info\n return ds.info\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.outputs","title":"outputs: List[str] property ","text":"The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. Returns: Type Description List[str] The columns that will be generated by this step. "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.load","title":"load() ","text":"Load the dataset from the Hugging Face Hub Source code in src/distilabel/steps/generators/huggingface.py def load(self) -> None:\n \"\"\"Load the dataset from the Hugging Face Hub\"\"\"\n super().load()\n\n if self._dataset is not None:\n # Here to simplify the functionality of\u00a0distilabel.steps.generators.util.make_generator_step\n return\n\n self._dataset = load_dataset(\n self.repo_id, # type: ignore\n self.config,\n split=self.split,\n revision=self.revision,\n streaming=self.streaming,\n )\n num_examples = self._get_dataset_num_examples()\n self.num_examples = (\n min(self.num_examples, num_examples) if self.num_examples else num_examples\n )\n\n if not self.streaming:\n self._dataset = self._dataset.select(range(self.num_examples))\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.process","title":"process(offset=0) ","text":"Yields batches from the loaded dataset from the Hugging Face Hub. Parameters: Name Type Description Default offset int The offset to start yielding the data from. Will be used during the caching process to help skipping already processed data. 0 Yields: Type Description GeneratorStepOutput A tuple containing a batch of rows and a boolean indicating if the batch is GeneratorStepOutput the last one. Source code in src/distilabel/steps/generators/huggingface.py def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Yields batches from the loaded dataset from the Hugging Face Hub.\n\n Args:\n offset: The offset to start yielding the data from. Will be used during the caching\n process to help skipping already processed data.\n\n Yields:\n A tuple containing a batch of rows and a boolean indicating if the batch is\n the last one.\n \"\"\"\n num_returned_rows = 0\n for batch_num, batch in enumerate(\n self._dataset.iter(batch_size=self.batch_size) # type: ignore\n ):\n if batch_num * self.batch_size < offset:\n continue\n transformed_batch = self._transform_batch(batch)\n batch_size = len(transformed_batch)\n num_returned_rows += batch_size\n yield transformed_batch, num_returned_rows >= self.num_examples\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.PushToHub","title":"PushToHub ","text":" Bases: GlobalStep Push data to a Hugging Face Hub dataset. A GlobalStep which creates a datasets.Dataset with the input data and pushes it to the Hugging Face Hub. Attributes: Name Type Description repo_id RuntimeParameter[str] The Hugging Face Hub repository ID where the dataset will be uploaded. split RuntimeParameter[str] The split of the dataset that will be pushed. Defaults to \"train\" . private RuntimeParameter[bool] Whether the dataset to be pushed should be private or not. Defaults to False . token Optional[RuntimeParameter[str]] The token that will be used to authenticate in the Hub. If not provided, the token will be tried to be obtained from the environment variable HF_TOKEN . 
If not provided using one of the previous methods, then the huggingface_hub library will try to use the token from the local Hugging Face CLI configuration. Defaults to None . Runtime parameters repo_id : The Hugging Face Hub repository ID where the dataset will be uploaded. split : The split of the dataset that will be pushed. private : Whether the dataset to be pushed should be private or not. token : The token that will be used to authenticate in the Hub. Input columns - dynamic (
all ): all columns from the input will be used to create the dataset. Categories Examples: Push batches of your dataset to the Hugging Face Hub repository: from distilabel.steps import PushToHub\n\npush = PushToHub(repo_id=\"path_to/repo\")\npush.load()\n\nresult = next(\n push.process(\n [\n {\n \"instruction\": \"instruction \",\n \"generation\": \"generation\"\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction ', 'generation': 'generation'}]\n Source code in src/distilabel/steps/globals/huggingface.py class PushToHub(GlobalStep):\n \"\"\"Push data to a Hugging Face Hub dataset.\n\n A `GlobalStep` which creates a `datasets.Dataset` with the input data and pushes\n it to the Hugging Face Hub.\n\n Attributes:\n repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.\n split: The split of the dataset that will be pushed. Defaults to `\"train\"`.\n private: Whether the dataset to be pushed should be private or not. Defaults to\n `False`.\n token: The token that will be used to authenticate in the Hub. If not provided, the\n token will be tried to be obtained from the environment variable `HF_TOKEN`.\n If not provided using one of the previous methods, then `huggingface_hub` library\n will try to use the token from the local Hugging Face CLI configuration. Defaults\n to `None`.\n\n Runtime parameters:\n - `repo_id`: The Hugging Face Hub repository ID where the dataset will be uploaded.\n - `split`: The split of the dataset that will be pushed.\n - `private`: Whether the dataset to be pushed should be private or not.\n - `token`: The token that will be used to authenticate in the Hub.\n\n Input columns:\n - dynamic (`all`): all columns from the input will be used to create the dataset.\n\n Categories:\n - save\n - dataset\n - huggingface\n\n Examples:\n Push batches of your dataset to the Hugging Face Hub repository:\n\n ```python\n from distilabel.steps import PushToHub\n\n push = PushToHub(repo_id=\"path_to/repo\")\n push.load()\n\n result = next(\n push.process(\n [\n {\n \"instruction\": \"instruction \",\n \"generation\": \"generation\"\n }\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction ', 'generation': 'generation'}]\n ```\n \"\"\"\n\n repo_id: RuntimeParameter[str] = Field(\n default=None,\n description=\"The Hugging Face Hub repository ID where the dataset will be uploaded.\",\n )\n split: RuntimeParameter[str] = Field(\n default=\"train\",\n description=\"The split of the dataset that will be pushed. Defaults to 'train'.\",\n )\n private: RuntimeParameter[bool] = Field(\n default=False,\n description=\"Whether the dataset to be pushed should be private or not. Defaults\"\n \" to `False`.\",\n )\n token: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The token that will be used to authenticate in the Hub. If not provided,\"\n \" the token will be tried to be obtained from the environment variable `HF_TOKEN`.\"\n \" If not provided using one of the previous methods, then `huggingface_hub` library\"\n \" will try to use the token from the local Hugging Face CLI configuration. 
Defaults\"\n \" to `None`\",\n )\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Method that processes the input data, respecting the `datasets.Dataset` formatting,\n and pushes it to the Hugging Face Hub based on the `RuntimeParameter`s attributes.\n\n Args:\n inputs: that input data within a single object (as it's a GlobalStep) that\n will be transformed into a `datasets.Dataset`.\n\n Yields:\n Propagates the received inputs so that the `Distiset` can be generated if this is\n the last step of the `Pipeline`, or if this is not a leaf step and has follow up\n steps.\n \"\"\"\n dataset_dict = defaultdict(list)\n for input in inputs:\n for key, value in input.items():\n dataset_dict[key].append(value)\n dataset_dict = dict(dataset_dict)\n dataset = Dataset.from_dict(dataset_dict)\n dataset.push_to_hub(\n self.repo_id, # type: ignore\n split=self.split,\n private=self.private,\n token=self.token or os.getenv(\"HF_TOKEN\"),\n )\n yield inputs\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.PushToHub.process","title":"process(inputs) ","text":"Method that processes the input data, respecting the datasets.Dataset formatting, and pushes it to the Hugging Face Hub based on the RuntimeParameter s attributes. Parameters: Name Type Description Default inputs StepInput that input data within a single object (as it's a GlobalStep) that will be transformed into a datasets.Dataset . required Yields: Type Description StepOutput Propagates the received inputs so that the Distiset can be generated if this is StepOutput the last step of the Pipeline , or if this is not a leaf step and has follow up StepOutput steps. Source code in src/distilabel/steps/globals/huggingface.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Method that processes the input data, respecting the `datasets.Dataset` formatting,\n and pushes it to the Hugging Face Hub based on the `RuntimeParameter`s attributes.\n\n Args:\n inputs: that input data within a single object (as it's a GlobalStep) that\n will be transformed into a `datasets.Dataset`.\n\n Yields:\n Propagates the received inputs so that the `Distiset` can be generated if this is\n the last step of the `Pipeline`, or if this is not a leaf step and has follow up\n steps.\n \"\"\"\n dataset_dict = defaultdict(list)\n for input in inputs:\n for key, value in input.items():\n dataset_dict[key].append(value)\n dataset_dict = dict(dataset_dict)\n dataset = Dataset.from_dict(dataset_dict)\n dataset.push_to_hub(\n self.repo_id, # type: ignore\n split=self.split,\n private=self.private,\n token=self.token or os.getenv(\"HF_TOKEN\"),\n )\n yield inputs\n "},{"location":"api/task/","title":"Task","text":"This section contains the API reference for the distilabel tasks. For more information on how the Task works and see some examples, check the Tutorial - Task page. "},{"location":"api/task/#distilabel.steps.tasks.base","title":"base ","text":""},{"location":"api/task/#distilabel.steps.tasks.base._Task","title":"_Task ","text":" Bases: _Step , ABC _Task is an abstract class that implements the _Step interface and adds the format_input and format_output methods to format the inputs and outputs of the task. It also adds a llm attribute to be used as the LLM to generate the outputs. Attributes: Name Type Description llm LLM the LLM to be used to generate the outputs of the task. group_generations bool whether to group the num_generations generated per input in a list or create a row per generation. 
Defaults to False . add_raw_output RuntimeParameter[bool] whether to include a field with the raw output of the LLM in the distilabel_metadata field of the output. Can be helpful to not loose data with Tasks that need to format the output of the LLM . Defaults to False . num_generations RuntimeParameter[int] The number of generations to be produced per input. Source code in src/distilabel/steps/tasks/base.py class _Task(_Step, ABC):\n \"\"\"_Task is an abstract class that implements the `_Step` interface and adds the\n `format_input` and `format_output` methods to format the inputs and outputs of the\n task. It also adds a `llm` attribute to be used as the LLM to generate the outputs.\n\n Attributes:\n llm: the `LLM` to be used to generate the outputs of the task.\n group_generations: whether to group the `num_generations` generated per input in\n a list or create a row per generation. Defaults to `False`.\n add_raw_output: whether to include a field with the raw output of the LLM in the\n `distilabel_metadata` field of the output. Can be helpful to not loose data\n with `Tasks` that need to format the output of the `LLM`. Defaults to `False`.\n num_generations: The number of generations to be produced per input.\n \"\"\"\n\n llm: LLM\n\n group_generations: bool = False\n add_raw_output: RuntimeParameter[bool] = Field(\n default=True,\n description=(\n \"Whether to include the raw output of the LLM in the key `raw_output_<TASK_NAME>`\"\n \" of the `distilabel_metadata` dictionary output column\"\n ),\n )\n add_raw_input: RuntimeParameter[bool] = Field(\n default=True,\n description=(\n \"Whether to include the raw input of the LLM in the key `raw_input_<TASK_NAME>`\"\n \" of the `distilabel_metadata` dictionary column\"\n ),\n )\n num_generations: RuntimeParameter[int] = Field(\n default=1, description=\"The number of generations to be produced per input.\"\n )\n use_default_structured_output: bool = False\n\n _can_be_used_with_offline_batch_generation: bool = PrivateAttr(False)\n\n def model_post_init(self, __context: Any) -> None:\n if (\n self.llm.use_offline_batch_generation\n and not self._can_be_used_with_offline_batch_generation\n ):\n raise DistilabelUserError(\n f\"`{self.__class__.__name__}` task cannot be used with offline batch generation\"\n \" feature.\",\n page=\"sections/how_to_guides/advanced/offline-batch-generation\",\n )\n\n super().model_post_init(__context)\n\n @property\n def is_global(self) -> bool:\n \"\"\"Extends the `is_global` property to return `True` if the task is using the\n offline batch generation feature, otherwise it returns the value of the parent\n class property. 
`offline_batch_generation` requires to receive all the inputs\n at once, so for the `_BatchManager` this is a global step.\n\n Returns:\n Whether the task is a global step or not.\n \"\"\"\n if self.llm.use_offline_batch_generation:\n return True\n\n return super().is_global\n\n def load(self) -> None:\n \"\"\"Loads the LLM via the `LLM.load()` method.\"\"\"\n super().load()\n self._set_default_structured_output()\n self.llm.load()\n\n @override\n def unload(self) -> None:\n \"\"\"Unloads the LLM.\"\"\"\n self._logger.debug(\"Executing task unload logic.\")\n self.llm.unload()\n\n @override\n def impute_step_outputs(\n self, step_output: List[Dict[str, Any]]\n ) -> List[Dict[str, Any]]:\n \"\"\"\n Imputes the outputs of the task in case the LLM failed to generate a response.\n \"\"\"\n result = []\n for row in step_output:\n data = row.copy()\n for output in self.get_outputs().keys():\n data[output] = None\n data = self._create_metadata(\n data,\n None,\n None,\n add_raw_output=self.add_raw_output,\n add_raw_input=self.add_raw_input,\n )\n result.append(data)\n return result\n\n @abstractmethod\n def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n ) -> Dict[str, Any]:\n \"\"\"Abstract method to format the outputs of the task. It needs to receive an output\n as a string, and generates a Python dictionary with the outputs of the task. In\n addition the `input` used to generate the output is also received just in case it's\n needed to be able to parse the output correctly.\n \"\"\"\n pass\n\n def _format_outputs(\n self,\n outputs: \"GenerateOutput\",\n input: Union[Dict[str, Any], None] = None,\n ) -> List[Dict[str, Any]]:\n \"\"\"Formats the outputs of the task using the `format_output` method. If the output\n is `None` (i.e. the LLM failed to generate a response), then the outputs will be\n set to `None` as well.\n\n Args:\n outputs: The outputs (`n` generations) for the provided `input`.\n input: The input used to generate the output.\n\n Returns:\n A list containing a dictionary with the outputs of the task for each input.\n \"\"\"\n inputs = [None] if input is None else [input]\n formatted_outputs = []\n repeate_inputs = len(outputs.get(\"generations\"))\n outputs = normalize_statistics(outputs)\n\n for (output, stats, extra), input in zip(\n iterate_generations_with_stats(outputs), inputs * repeate_inputs\n ): # type: ignore\n try:\n # Extract the generations, and move the statistics to the distilabel_metadata,\n # to keep everything clean\n formatted_output = self.format_output(output, input)\n formatted_output = self._create_metadata(\n output=formatted_output,\n raw_output=output,\n input=input,\n add_raw_output=self.add_raw_output, # type: ignore\n add_raw_input=self.add_raw_input, # type: ignore\n statistics=stats,\n )\n formatted_output = self._create_extra(\n output=formatted_output, extra=extra\n )\n formatted_outputs.append(formatted_output)\n except Exception as e:\n self._logger.warning( # type: ignore\n f\"Task '{self.name}' failed to format output: {e}. 
Saving raw response.\" # type: ignore\n )\n formatted_outputs.append(self._output_on_failure(output, input))\n return formatted_outputs\n\n def _output_on_failure(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n \"\"\"In case of failure to format the output, this method will return a dictionary including\n a new field `distilabel_meta` with the raw output of the LLM.\n \"\"\"\n # Create a dictionary with the outputs of the task (every output set to None)\n outputs = {output: None for output in self.outputs}\n outputs[\"model_name\"] = self.llm.model_name # type: ignore\n outputs = self._create_metadata(\n outputs,\n output,\n input,\n add_raw_output=self.add_raw_output, # type: ignore\n add_raw_input=self.add_raw_input, # type: ignore\n )\n return outputs\n\n def _create_metadata(\n self,\n output: Dict[str, Any],\n raw_output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n add_raw_output: bool = True,\n add_raw_input: bool = True,\n statistics: Optional[\"LLMStatistics\"] = None,\n ) -> Dict[str, Any]:\n \"\"\"Adds the raw output and or the formatted input of the LLM to the output dictionary\n if `add_raw_output` is True or `add_raw_input` is True.\n\n Args:\n output:\n The output dictionary after formatting the output from the LLM,\n to add the raw output and or raw input.\n raw_output: The raw output of the `LLM`.\n input: The input used to generate the output.\n add_raw_output: Whether to add the raw output to the output dictionary.\n add_raw_input: Whether to add the raw input to the output dictionary.\n statistics: The statistics generated by the LLM, which should contain at least\n the number of input and output tokens.\n \"\"\"\n meta = output.get(DISTILABEL_METADATA_KEY, {})\n\n if add_raw_output:\n meta[f\"raw_output_{self.name}\"] = raw_output\n\n if add_raw_input:\n meta[f\"raw_input_{self.name}\"] = self.format_input(input) if input else None\n\n if statistics:\n meta[f\"statistics_{self.name}\"] = statistics\n\n if meta:\n output[DISTILABEL_METADATA_KEY] = meta\n\n return output\n\n def _create_extra(\n self, output: Dict[str, Any], extra: Dict[str, Any]\n ) -> Dict[str, Any]:\n column_name_prefix = f\"llm_{self.name}_\"\n for key, value in extra.items():\n column_name = column_name_prefix + key\n output[column_name] = value\n return output\n\n def _set_default_structured_output(self) -> None:\n \"\"\"Prepares the structured output to be set in the selected `LLM`.\n\n If the method `get_structured_output` returns None (the default), there's no need\n to set anything, as it doesn't apply.\n If the `use_default_structured_output` and there's no previous structured output\n set by hand, then decide the type of structured output to select depending on the\n `LLM` provider.\n \"\"\"\n schema = self.get_structured_output()\n if not schema:\n return\n\n if self.use_default_structured_output and not self.llm.structured_output:\n # In case the default structured output is required, we have to set it before\n # the LLM is loaded\n from distilabel.models.llms import InferenceEndpointsLLM\n from distilabel.models.llms.base import AsyncLLM\n\n def check_dependency(module_name: str) -> None:\n if not importlib.util.find_spec(module_name):\n raise ImportError(\n f\"`{module_name}` is not installed and is needed for the structured generation with this LLM.\"\n f\" Please install it using `pip install {module_name}`.\"\n )\n\n dependency = \"outlines\"\n structured_output = {\"schema\": schema}\n if isinstance(self.llm, 
InferenceEndpointsLLM):\n structured_output.update({\"format\": \"json\"})\n # To determine instructor or outlines format\n elif isinstance(self.llm, AsyncLLM) and not isinstance(\n self.llm, InferenceEndpointsLLM\n ):\n dependency = \"instructor\"\n structured_output.update({\"format\": \"json\"})\n\n check_dependency(dependency)\n self.llm.structured_output = structured_output\n\n def get_structured_output(self) -> Union[Dict[str, Any], None]:\n \"\"\"Returns the structured output for a task that implements one by default,\n must be overriden by subclasses of `Task`. When implemented, should be a json\n schema that enforces the response from the LLM so that it's easier to parse.\n \"\"\"\n return None\n\n def _sample_input(self) -> \"ChatType\":\n \"\"\"Returns a sample input to be used in the `print` method.\n Tasks that don't adhere to a format input that returns a map of the type\n str -> str should override this method to return a sample input.\n \"\"\"\n return self.format_input(\n {input: f\"<PLACEHOLDER_{input.upper()}>\" for input in self.inputs}\n )\n\n def print(self, sample_input: Optional[\"ChatType\"] = None) -> None:\n \"\"\"Prints a sample input to the console using the `rich` library.\n Helper method to visualize the prompt of the task.\n\n Args:\n sample_input: A sample input to be printed. If not provided, a default will be\n generated using the `_sample_input` method, which can be overriden by\n subclasses. This should correspond to the same example you could pass to\n the `format_input` method.\n The variables be named <PLACEHOLDER_VARIABLE_NAME> by default.\n\n Examples:\n Print the URIAL prompt:\n\n ```python\n from distilabel.steps.tasks import URIAL\n from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n urial = URIAL(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n )\n urial.load()\n urial.print()\n \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n \u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n \u2502 \u2502 # Instruction \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you). \u2502 \u2502\n \u2502 \u2502 Users place their queries under \"# User:\", and your responses are under \"# Assistant:\". \u2502 \u2502\n \u2502 \u2502 You are a helpful, respectful, and honest assistant. \u2502 \u2502\n \u2502 \u2502 You should always answer as helpfully as possible while ensuring safety. 
\u2502 \u2502\n \u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n \u2502 \u2502 have an engaging tone. \u2502 \u2502\n \u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic, \u2502 \u2502\n \u2502 \u2502 dangerous, or illegal content, even if it may be helpful. \u2502 \u2502\n \u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some \u2502 \u2502\n \u2502 \u2502 controversial topics. \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 # User: \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 <PLACEHOLDER_INSTRUCTION> \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 # Assistant: \u2502 \u2502\n \u2502 \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n ```\n \"\"\"\n from rich.console import Console, Group\n from rich.panel import Panel\n from rich.text import Text\n\n console = Console()\n sample_input = sample_input or self._sample_input()\n\n panels = []\n for item in sample_input:\n content = Text.assemble((item.get(\"content\", \"\"),))\n panel = Panel(\n content,\n title=f\"[bold][magenta]{item.get('role', '').capitalize()} Message[/magenta][/bold]\",\n border_style=\"light_cyan3\",\n )\n panels.append(panel)\n\n # Create a group of panels\n # Wrap the group in an outer panel\n outer_panel = Panel(\n Group(*panels),\n title=f\"[bold][magenta]Prompt: {type(self).__name__} [/magenta][/bold]\",\n border_style=\"light_cyan3\",\n expand=False,\n )\n console.print(outer_panel)\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.is_global","title":"is_global: bool property ","text":"Extends the is_global property to return True if the task is using the offline batch generation feature, otherwise it returns the value of the parent class property. offline_batch_generation requires to receive all the inputs at once, so for the _BatchManager this is a global step. Returns: Type Description bool Whether the task is a global step or not. "},{"location":"api/task/#distilabel.steps.tasks.base._Task.load","title":"load() ","text":"Loads the LLM via the LLM.load() method. 
Source code in src/distilabel/steps/tasks/base.py def load(self) -> None:\n \"\"\"Loads the LLM via the `LLM.load()` method.\"\"\"\n super().load()\n self._set_default_structured_output()\n self.llm.load()\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.unload","title":"unload() ","text":"Unloads the LLM. Source code in src/distilabel/steps/tasks/base.py @override\ndef unload(self) -> None:\n \"\"\"Unloads the LLM.\"\"\"\n self._logger.debug(\"Executing task unload logic.\")\n self.llm.unload()\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.impute_step_outputs","title":"impute_step_outputs(step_output) ","text":"Imputes the outputs of the task in case the LLM failed to generate a response. Source code in src/distilabel/steps/tasks/base.py @override\ndef impute_step_outputs(\n self, step_output: List[Dict[str, Any]]\n) -> List[Dict[str, Any]]:\n \"\"\"\n Imputes the outputs of the task in case the LLM failed to generate a response.\n \"\"\"\n result = []\n for row in step_output:\n data = row.copy()\n for output in self.get_outputs().keys():\n data[output] = None\n data = self._create_metadata(\n data,\n None,\n None,\n add_raw_output=self.add_raw_output,\n add_raw_input=self.add_raw_input,\n )\n result.append(data)\n return result\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.format_output","title":"format_output(output, input=None) abstractmethod ","text":"Abstract method to format the outputs of the task. It needs to receive an output as a string, and generates a Python dictionary with the outputs of the task. In addition the input used to generate the output is also received just in case it's needed to be able to parse the output correctly. Source code in src/distilabel/steps/tasks/base.py @abstractmethod\ndef format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n \"\"\"Abstract method to format the outputs of the task. It needs to receive an output\n as a string, and generates a Python dictionary with the outputs of the task. In\n addition the `input` used to generate the output is also received just in case it's\n needed to be able to parse the output correctly.\n \"\"\"\n pass\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.get_structured_output","title":"get_structured_output() ","text":"Returns the structured output for a task that implements one by default, must be overriden by subclasses of Task . When implemented, should be a json schema that enforces the response from the LLM so that it's easier to parse. Source code in src/distilabel/steps/tasks/base.py def get_structured_output(self) -> Union[Dict[str, Any], None]:\n \"\"\"Returns the structured output for a task that implements one by default,\n must be overriden by subclasses of `Task`. When implemented, should be a json\n schema that enforces the response from the LLM so that it's easier to parse.\n \"\"\"\n return None\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.print","title":"print(sample_input=None) ","text":"Prints a sample input to the console using the rich library. Helper method to visualize the prompt of the task. Parameters: Name Type Description Default sample_input Optional[ChatType] A sample input to be printed. If not provided, a default will be generated using the _sample_input method, which can be overriden by subclasses. This should correspond to the same example you could pass to the format_input method. The variables be named by default. 
None Examples: Print the URIAL prompt: from distilabel.steps.tasks import URIAL\nfrom distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nurial = URIAL(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n)\nurial.load()\nurial.print()\n\u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n\u2502 \u2502 # Instruction \u2502 \u2502\n\u2502 \u2502 \u2502 \u2502\n\u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you). \u2502 \u2502\n\u2502 \u2502 Users place their queries under \"# User:\", and your responses are under \"# Assistant:\". \u2502 \u2502\n\u2502 \u2502 You are a helpful, respectful, and honest assistant. \u2502 \u2502\n\u2502 \u2502 You should always answer as helpfully as possible while ensuring safety. \u2502 \u2502\n\u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n\u2502 \u2502 have an engaging tone. \u2502 \u2502\n\u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic, \u2502 \u2502\n\u2502 \u2502 dangerous, or illegal content, even if it may be helpful. \u2502 \u2502\n\u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some \u2502 \u2502\n\u2502 \u2502 controversial topics. 
\u2502 \u2502\n\u2502 \u2502 \u2502 \u2502\n\u2502 \u2502 \u2502 \u2502\n\u2502 \u2502 # User: \u2502 \u2502\n\u2502 \u2502 \u2502 \u2502\n\u2502 \u2502 <PLACEHOLDER_INSTRUCTION> \u2502 \u2502\n\u2502 \u2502 \u2502 \u2502\n\u2502 \u2502 # Assistant: \u2502 \u2502\n\u2502 \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n Source code in src/distilabel/steps/tasks/base.py def print(self, sample_input: Optional[\"ChatType\"] = None) -> None:\n \"\"\"Prints a sample input to the console using the `rich` library.\n Helper method to visualize the prompt of the task.\n\n Args:\n sample_input: A sample input to be printed. If not provided, a default will be\n generated using the `_sample_input` method, which can be overriden by\n subclasses. This should correspond to the same example you could pass to\n the `format_input` method.\n The variables be named <PLACEHOLDER_VARIABLE_NAME> by default.\n\n Examples:\n Print the URIAL prompt:\n\n ```python\n from distilabel.steps.tasks import URIAL\n from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n urial = URIAL(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n )\n urial.load()\n urial.print()\n \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n \u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n \u2502 \u2502 # Instruction \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you). 
\u2502 \u2502\n \u2502 \u2502 Users place their queries under \"# User:\", and your responses are under \"# Assistant:\". \u2502 \u2502\n \u2502 \u2502 You are a helpful, respectful, and honest assistant. \u2502 \u2502\n \u2502 \u2502 You should always answer as helpfully as possible while ensuring safety. \u2502 \u2502\n \u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n \u2502 \u2502 have an engaging tone. \u2502 \u2502\n \u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic, \u2502 \u2502\n \u2502 \u2502 dangerous, or illegal content, even if it may be helpful. \u2502 \u2502\n \u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some \u2502 \u2502\n \u2502 \u2502 controversial topics. \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 # User: \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 <PLACEHOLDER_INSTRUCTION> \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 # Assistant: \u2502 \u2502\n \u2502 \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n ```\n \"\"\"\n from rich.console import Console, Group\n from rich.panel import Panel\n from rich.text import Text\n\n console = Console()\n sample_input = sample_input or self._sample_input()\n\n panels = []\n for item in sample_input:\n content = Text.assemble((item.get(\"content\", \"\"),))\n panel = Panel(\n content,\n title=f\"[bold][magenta]{item.get('role', '').capitalize()} Message[/magenta][/bold]\",\n border_style=\"light_cyan3\",\n )\n panels.append(panel)\n\n # Create a group of panels\n # Wrap the group in an outer panel\n outer_panel = Panel(\n Group(*panels),\n title=f\"[bold][magenta]Prompt: {type(self).__name__} [/magenta][/bold]\",\n border_style=\"light_cyan3\",\n expand=False,\n )\n console.print(outer_panel)\n "},{"location":"api/task/#distilabel.steps.tasks.base.Task","title":"Task ","text":" Bases: _Task , Step Task is a class that implements the _Task abstract class and adds the Step interface to be used as a step in the pipeline. Attributes: Name Type Description llm the LLM to be used to generate the outputs of the task. group_generations whether to group the num_generations generated per input in a list or create a row per generation. Defaults to False . num_generations The number of generations to be produced per input. 
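To make the interface above concrete, a minimal sketch of a `Task` subclass is shown below. The class name `SimpleQuestionAnswering` and the `question`/`answer` column names are hypothetical and chosen only for illustration; the essential pieces are the `format_input` and `format_output` methods plus the declared `inputs`/`outputs` columns.

```python
# A minimal sketch, assuming `distilabel` is installed; the task name and the
# "question"/"answer" columns are made up for illustration.
from typing import Any, Dict, List, Union

from distilabel.steps.tasks import Task


class SimpleQuestionAnswering(Task):
    """Hypothetical task: sends each question to the LLM and stores the reply."""

    @property
    def inputs(self) -> List[str]:
        return ["question"]

    @property
    def outputs(self) -> List[str]:
        return ["answer", "model_name"]

    def format_input(self, input: Dict[str, Any]) -> List[Dict[str, str]]:
        # Turn one dataset row into an OpenAI chat-like list of messages.
        return [{"role": "user", "content": input["question"]}]

    def format_output(
        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None
    ) -> Dict[str, Any]:
        # Map the raw LLM string back to the declared output columns.
        return {"answer": output}
```

An instance of such a subclass still needs an `llm` passed at construction time, exactly as for the built-in tasks.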
Source code in src/distilabel/steps/tasks/base.py class Task(_Task, Step):\n \"\"\"Task is a class that implements the `_Task` abstract class and adds the `Step`\n interface to be used as a step in the pipeline.\n\n Attributes:\n llm: the `LLM` to be used to generate the outputs of the task.\n group_generations: whether to group the `num_generations` generated per input in\n a list or create a row per generation. Defaults to `False`.\n num_generations: The number of generations to be produced per input.\n \"\"\"\n\n @abstractmethod\n def format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n \"\"\"Abstract method to format the inputs of the task. It needs to receive an input\n as a Python dictionary, and generates an OpenAI chat-like list of dicts.\"\"\"\n pass\n\n def _format_inputs(self, inputs: List[Dict[str, Any]]) -> List[\"FormattedInput\"]:\n \"\"\"Formats the inputs of the task using the `format_input` method.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list containing the formatted inputs, which are `ChatType`-like following\n the OpenAI formatting.\n \"\"\"\n return [self.format_input(input) for input in inputs]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n formatted_inputs = self._format_inputs(inputs)\n\n # `outputs` is a dict containing the LLM outputs in the `generations`\n # key and the statistics in the `statistics` key\n outputs = self.llm.generate_outputs(\n inputs=formatted_inputs,\n num_generations=self.num_generations, # type: ignore\n **self.llm.get_generation_kwargs(), # type: ignore\n )\n task_outputs = []\n for input, input_outputs in zip(inputs, outputs):\n formatted_outputs = self._format_outputs(input_outputs, input)\n\n if self.group_generations:\n combined = group_dicts(*formatted_outputs)\n task_outputs.append(\n {**input, **combined, \"model_name\": self.llm.model_name}\n )\n continue\n\n # Create a row per generation\n for formatted_output in formatted_outputs:\n task_outputs.append(\n {**input, **formatted_output, \"model_name\": self.llm.model_name}\n )\n\n yield task_outputs\n "},{"location":"api/task/#distilabel.steps.tasks.base.Task.format_input","title":"format_input(input) abstractmethod ","text":"Abstract method to format the inputs of the task. It needs to receive an input as a Python dictionary, and generates an OpenAI chat-like list of dicts. Source code in src/distilabel/steps/tasks/base.py @abstractmethod\ndef format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n \"\"\"Abstract method to format the inputs of the task. It needs to receive an input\n as a Python dictionary, and generates an OpenAI chat-like list of dicts.\"\"\"\n pass\n "},{"location":"api/task/#distilabel.steps.tasks.base.Task.process","title":"process(inputs) ","text":"Processes the inputs of the task and generates the outputs using the LLM. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Yields: Type Description StepOutput A list of Python dictionaries with the outputs of the task. 
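As a rough usage sketch (not taken verbatim from the library docs), `process` is a generator that yields one enriched batch; the snippet below assumes the built-in `TextGeneration` task and a valid Inference Endpoints token in the environment, and shows how `num_generations` interacts with `group_generations`.

```python
# Sketch only: assumes `distilabel` is installed and a valid HF token is
# available for Inference Endpoints; column names follow `TextGeneration`.
from distilabel.models.llms.huggingface import InferenceEndpointsLLM
from distilabel.steps.tasks import TextGeneration

task = TextGeneration(
    llm=InferenceEndpointsLLM(model_id="meta-llama/Meta-Llama-3.1-70B-Instruct"),
    num_generations=2,        # two completions per input row
    group_generations=False,  # one output row per completion
)
task.load()

batch = next(task.process([{"instruction": "What is synthetic data?"}]))
# With `group_generations=False` the batch should contain 2 rows, each with a
# `generation` column, `model_name` and the `distilabel_metadata` field.
```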
Source code in src/distilabel/steps/tasks/base.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n formatted_inputs = self._format_inputs(inputs)\n\n # `outputs` is a dict containing the LLM outputs in the `generations`\n # key and the statistics in the `statistics` key\n outputs = self.llm.generate_outputs(\n inputs=formatted_inputs,\n num_generations=self.num_generations, # type: ignore\n **self.llm.get_generation_kwargs(), # type: ignore\n )\n task_outputs = []\n for input, input_outputs in zip(inputs, outputs):\n formatted_outputs = self._format_outputs(input_outputs, input)\n\n if self.group_generations:\n combined = group_dicts(*formatted_outputs)\n task_outputs.append(\n {**input, **combined, \"model_name\": self.llm.model_name}\n )\n continue\n\n # Create a row per generation\n for formatted_output in formatted_outputs:\n task_outputs.append(\n {**input, **formatted_output, \"model_name\": self.llm.model_name}\n )\n\n yield task_outputs\n "},{"location":"api/task/generator_task/","title":"GeneratorTask","text":"This section contains the API reference for the distilabel generator tasks. For more information on how the GeneratorTask works and see some examples, check the Tutorial - Task - GeneratorTask page. "},{"location":"api/task/generator_task/#distilabel.steps.tasks.base.GeneratorTask","title":"GeneratorTask ","text":" Bases: _Task , GeneratorStep GeneratorTask is a class that implements the _Task abstract class and adds the GeneratorStep interface to be used as a step in the pipeline. Attributes: Name Type Description llm the LLM to be used to generate the outputs of the task. group_generations whether to group the num_generations generated per input in a list or create a row per generation. Defaults to False . num_generations The number of generations to be produced per input. Source code in src/distilabel/steps/tasks/base.py class GeneratorTask(_Task, GeneratorStep):\n \"\"\"`GeneratorTask` is a class that implements the `_Task` abstract class and adds the\n `GeneratorStep` interface to be used as a step in the pipeline.\n\n Attributes:\n llm: the `LLM` to be used to generate the outputs of the task.\n group_generations: whether to group the `num_generations` generated per input in\n a list or create a row per generation. Defaults to `False`.\n num_generations: The number of generations to be produced per input.\n \"\"\"\n\n pass\n "},{"location":"api/task/task_gallery/","title":"Task Gallery","text":"This section contains the existing Task subclasses implemented in distilabel . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks","title":"tasks ","text":""},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker","title":"APIGenExecutionChecker ","text":" Bases: Step Executes the generated function calls. This step checks if a given answer from a model as generated by APIGenGenerator can be executed against the given library (given by libpath , which is a string pointing to a python .py file with functions). Attributes: Name Type Description libpath str The path to the library where we will retrieve the functions. It can also point to a folder with the functions. 
In this case, the folder should contain .py files, each holding a single function whose name matches the filename. check_is_dangerous bool Whether to exclude some potentially dangerous functions, based on heuristics found while testing. These functions can run subprocesses, deal with the OS, or perform other potentially dangerous operations. Defaults to True. Input columns - answers (
str ): List with arguments to be passed to the function, dumped as a string from a list of dictionaries. Should be loaded using json.loads . Output columns - keep_row_after_execution_check (
bool ): Whether the function should be kept or not. - execution_result (
str ): The result from executing the function. Categories - filtering - execution References - APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets
- Salesforce/xlam-function-calling-60k
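Before the example below, it may help to see what a `libpath` library file can look like when pointing to a folder of functions. The `final_velocity.py` sketch here is illustrative (it mirrors the sample module used in the example) and follows the one-function-per-file convention described above.

```python
# final_velocity.py -- sketch of a single-function library file for `libpath`
# (folder layout: one .py file per function, file name equal to function name).
def final_velocity(initial_velocity: float, acceleration: float, time: float) -> float:
    """Return the final velocity after accelerating for `time` seconds."""
    return initial_velocity + acceleration * time
```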
Examples: Execute a function from a given library with the answer from an LLM: from distilabel.steps.tasks import APIGenExecutionChecker\n\n# For the libpath you can use as an example the file at the tests folder:\n# ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\ntask = APIGenExecutionChecker(\n libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n)\ntask.load()\n\nres = next(\n task.process(\n [\n {\n \"answers\": [\n {\n \"arguments\": {\n \"initial_velocity\": 0.2,\n \"acceleration\": 0.1,\n \"time\": 0.5,\n },\n \"name\": \"final_velocity\",\n }\n ],\n }\n ]\n )\n)\nres\n#[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n Source code in src/distilabel/steps/tasks/apigen/execution_checker.py class APIGenExecutionChecker(Step):\n \"\"\"Executes the generated function calls.\n\n This step checks if a given answer from a model as generated by `APIGenGenerator`\n can be executed against the given library (given by `libpath`, which is a string\n pointing to a python .py file with functions).\n\n Attributes:\n libpath: The path to the library where we will retrieve the functions.\n It can also point to a folder with the functions. In this case, the folder\n layout should be a folder with .py files, each containing a single function,\n the name of the function being the same as the filename.\n check_is_dangerous: Bool to exclude some potentially dangerous functions, it contains\n some heuristics found while testing. This functions can run subprocesses, deal with\n the OS, or have other potentially dangerous operations. Defaults to True.\n\n Input columns:\n - answers (`str`): List with arguments to be passed to the function,\n dumped as a string from a list of dictionaries. 
Should be loaded using\n `json.loads`.\n\n Output columns:\n - keep_row_after_execution_check (`bool`): Whether the function should be kept or not.\n - execution_result (`str`): The result from executing the function.\n\n Categories:\n - filtering\n - execution\n\n References:\n - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n Examples:\n Execute a function from a given library with the answer from an LLM:\n\n ```python\n from distilabel.steps.tasks import APIGenExecutionChecker\n\n # For the libpath you can use as an example the file at the tests folder:\n # ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\n task = APIGenExecutionChecker(\n libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n )\n task.load()\n\n res = next(\n task.process(\n [\n {\n \"answers\": [\n {\n \"arguments\": {\n \"initial_velocity\": 0.2,\n \"acceleration\": 0.1,\n \"time\": 0.5,\n },\n \"name\": \"final_velocity\",\n }\n ],\n }\n ]\n )\n )\n res\n #[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n ```\n \"\"\"\n\n libpath: str = Field(\n default=...,\n description=(\n \"The path to the library where we will retrieve the functions, \"\n \"or a folder with python files named the same as the functions they contain.\",\n ),\n )\n check_is_dangerous: bool = Field(\n default=True,\n description=(\n \"Bool to exclude some potentially dangerous functions, it contains \"\n \"some heuristics found while testing. This functions can run subprocesses, \"\n \"deal with the OS, or have other potentially dangerous operations.\",\n ),\n )\n\n _toolbox: Union[\"ModuleType\", None] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the library where the functions will be extracted from.\"\"\"\n super().load()\n if Path(self.libpath).suffix == \".py\":\n self._toolbox = load_module_from_path(self.libpath)\n\n def unload(self) -> None:\n self._toolbox = None\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task are those found in the original dataset.\"\"\"\n return [\"answers\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs are the columns required by `APIGenGenerator` task.\"\"\"\n return [\"keep_row_after_execution_check\", \"execution_result\"]\n\n def _get_function(self, function_name: str) -> Callable:\n \"\"\"Retrieves the function from the toolbox.\n\n Args:\n function_name: The name of the function to retrieve.\n\n Returns:\n Callable: The function to be executed.\n \"\"\"\n if self._toolbox:\n return getattr(self._toolbox, function_name, None)\n try:\n toolbox = load_module_from_path(\n str(Path(self.libpath) / f\"{function_name}.py\")\n )\n return getattr(toolbox, function_name, None)\n except FileNotFoundError:\n return None\n except Exception as e:\n self._logger.warning(f\"Error loading function '{function_name}': {e}\")\n return None\n\n def _is_dangerous(self, function: Callable) -> bool:\n \"\"\"Checks if a function is dangerous to remove it.\n Contains a list of heuristics to avoid executing possibly dangerous functions.\n \"\"\"\n source_code = inspect.getsource(function)\n # We don't want to execute functions that use subprocess\n if (\n (\"subprocess.\" in source_code)\n or 
(\"os.system(\" in source_code)\n or (\"input(\" in source_code)\n # Avoiding threading\n or (\"threading.Thread(\" in source_code)\n or (\"exec(\" in source_code)\n # Avoiding argparse (not sure why)\n or (\"argparse.ArgumentParser(\" in source_code)\n # Avoiding logging changing the levels to not mess with the logs\n or (\".setLevel(\" in source_code)\n # Don't run a test battery\n or (\"unittest.main(\" in source_code)\n # Avoid exiting the program\n or (\"sys.exit(\" in source_code)\n or (\"exit(\" in source_code)\n or (\"raise SystemExit(\" in source_code)\n or (\"multiprocessing.Pool(\" in source_code)\n ):\n return True\n return False\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Checks the answer to see if it can be executed.\n Captures the possible errors and returns them.\n\n If a single example is provided, it is copied to avoid raising an error.\n\n Args:\n inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n output = []\n if input[\"answers\"]:\n answers = json.loads(input[\"answers\"])\n else:\n input.update(\n **{\n \"keep_row_after_execution_check\": False,\n \"execution_result\": [\"No answers were provided.\"],\n }\n )\n continue\n for answer in answers:\n if answer is None:\n output.append(\n {\n \"keep\": False,\n \"execution_result\": \"Nothing was generated for this answer.\",\n }\n )\n continue\n\n function_name = answer.get(\"name\", None)\n arguments = answer.get(\"arguments\", None)\n\n self._logger.debug(\n f\"Executing function '{function_name}' with arguments: {arguments}\"\n )\n function = self._get_function(function_name)\n\n if self.check_is_dangerous:\n if function and self._is_dangerous(function):\n function = None\n\n if function is None:\n output.append(\n {\n \"keep\": False,\n \"execution_result\": f\"Function '{function_name}' not found.\",\n }\n )\n else:\n execution = execute_from_response(function, arguments)\n output.append(\n {\n \"keep\": execution[\"keep\"],\n \"execution_result\": execution[\"execution_result\"],\n }\n )\n # We only consider a good response if all the answers were executed successfully,\n # but keep the reasons for further review if needed.\n input.update(\n **{\n \"keep_row_after_execution_check\": all(\n o[\"keep\"] is True for o in output\n ),\n \"execution_result\": [o[\"execution_result\"] for o in output],\n }\n )\n\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task are those found in the original dataset. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.outputs","title":"outputs: StepColumns property ","text":"The outputs are the columns required by APIGenGenerator task. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.load","title":"load() ","text":"Loads the library where the functions will be extracted from. Source code in src/distilabel/steps/tasks/apigen/execution_checker.py def load(self) -> None:\n \"\"\"Loads the library where the functions will be extracted from.\"\"\"\n super().load()\n if Path(self.libpath).suffix == \".py\":\n self._toolbox = load_module_from_path(self.libpath)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker._get_function","title":"_get_function(function_name) ","text":"Retrieves the function from the toolbox. 
Parameters: Name Type Description Default function_name str The name of the function to retrieve. required Returns: Name Type Description Callable Callable The function to be executed. Source code in src/distilabel/steps/tasks/apigen/execution_checker.py def _get_function(self, function_name: str) -> Callable:\n \"\"\"Retrieves the function from the toolbox.\n\n Args:\n function_name: The name of the function to retrieve.\n\n Returns:\n Callable: The function to be executed.\n \"\"\"\n if self._toolbox:\n return getattr(self._toolbox, function_name, None)\n try:\n toolbox = load_module_from_path(\n str(Path(self.libpath) / f\"{function_name}.py\")\n )\n return getattr(toolbox, function_name, None)\n except FileNotFoundError:\n return None\n except Exception as e:\n self._logger.warning(f\"Error loading function '{function_name}': {e}\")\n return None\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker._is_dangerous","title":"_is_dangerous(function) ","text":"Checks if a function is dangerous to remove it. Contains a list of heuristics to avoid executing possibly dangerous functions. Source code in src/distilabel/steps/tasks/apigen/execution_checker.py def _is_dangerous(self, function: Callable) -> bool:\n \"\"\"Checks if a function is dangerous to remove it.\n Contains a list of heuristics to avoid executing possibly dangerous functions.\n \"\"\"\n source_code = inspect.getsource(function)\n # We don't want to execute functions that use subprocess\n if (\n (\"subprocess.\" in source_code)\n or (\"os.system(\" in source_code)\n or (\"input(\" in source_code)\n # Avoiding threading\n or (\"threading.Thread(\" in source_code)\n or (\"exec(\" in source_code)\n # Avoiding argparse (not sure why)\n or (\"argparse.ArgumentParser(\" in source_code)\n # Avoiding logging changing the levels to not mess with the logs\n or (\".setLevel(\" in source_code)\n # Don't run a test battery\n or (\"unittest.main(\" in source_code)\n # Avoid exiting the program\n or (\"sys.exit(\" in source_code)\n or (\"exit(\" in source_code)\n or (\"raise SystemExit(\" in source_code)\n or (\"multiprocessing.Pool(\" in source_code)\n ):\n return True\n return False\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.process","title":"process(inputs) ","text":"Checks the answer to see if it can be executed. Captures the possible errors and returns them. If a single example is provided, it is copied to avoid raising an error. Parameters: Name Type Description Default inputs StepInput A list of dictionaries with the input data. required Yields: Type Description StepOutput A list of dictionaries with the output data. 
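For reference, a sketch of the expected input shape for `process` follows; per the documented contract, the `answers` column is a JSON-dumped list of function-call dictionaries (the values echo the `final_velocity` example above).

```python
import json

# Sketch: the `answers` column holds a JSON string with a list of calls,
# each carrying the function `name` and its `arguments`.
row = {
    "answers": json.dumps(
        [
            {
                "name": "final_velocity",
                "arguments": {"initial_velocity": 0.2, "acceleration": 0.1, "time": 0.5},
            }
        ]
    )
}
# Passing `[row]` to `APIGenExecutionChecker.process` should execute the call
# and set `keep_row_after_execution_check` to True when it succeeds.
```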
Source code in src/distilabel/steps/tasks/apigen/execution_checker.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Checks the answer to see if it can be executed.\n Captures the possible errors and returns them.\n\n If a single example is provided, it is copied to avoid raising an error.\n\n Args:\n inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n output = []\n if input[\"answers\"]:\n answers = json.loads(input[\"answers\"])\n else:\n input.update(\n **{\n \"keep_row_after_execution_check\": False,\n \"execution_result\": [\"No answers were provided.\"],\n }\n )\n continue\n for answer in answers:\n if answer is None:\n output.append(\n {\n \"keep\": False,\n \"execution_result\": \"Nothing was generated for this answer.\",\n }\n )\n continue\n\n function_name = answer.get(\"name\", None)\n arguments = answer.get(\"arguments\", None)\n\n self._logger.debug(\n f\"Executing function '{function_name}' with arguments: {arguments}\"\n )\n function = self._get_function(function_name)\n\n if self.check_is_dangerous:\n if function and self._is_dangerous(function):\n function = None\n\n if function is None:\n output.append(\n {\n \"keep\": False,\n \"execution_result\": f\"Function '{function_name}' not found.\",\n }\n )\n else:\n execution = execute_from_response(function, arguments)\n output.append(\n {\n \"keep\": execution[\"keep\"],\n \"execution_result\": execution[\"execution_result\"],\n }\n )\n # We only consider a good response if all the answers were executed successfully,\n # but keep the reasons for further review if needed.\n input.update(\n **{\n \"keep_row_after_execution_check\": all(\n o[\"keep\"] is True for o in output\n ),\n \"execution_result\": [o[\"execution_result\"] for o in output],\n }\n )\n\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator","title":"APIGenGenerator ","text":" Bases: Task Generate queries and answers for the given functions in JSON format. The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\nverifiable and diverse function-calling datasets. The task generates a set of diverse queries\nand corresponding answers for the given functions in JSON format.\n\nAttributes:\n system_prompt: The system prompt to guide the user in the generation of queries and answers.\n use_tools: Whether to use the tools available in the prompt to generate the queries and answers.\n In case the tools are given in the input, they will be added to the prompt.\n number: The number of queries to generate. 
It can be a list, where each number will be\n chosen randomly, or a dictionary with the number of queries and the probability of each.\n I.e: `number=1`, `number=[1, 2, 3]`, `number={1: 0.5, 2: 0.3, 3: 0.2}` are all valid inputs.\n It corresponds to the number of parallel queries to generate.\n use_default_structured_output: Whether to use the default structured output or not.\n\nInput columns:\n - examples (`str`): Examples used as few shots to guide the model.\n - func_name (`str`): Name for the function to generate.\n - func_desc (`str`): Description of what the function should do.\n - tools (`str`): JSON formatted string containing the tool representation of the function.\n\nOutput columns:\n - query (`str`): The list of queries.\n - answers (`str`): JSON formatted string with the list of answers, containing the info as\n a dictionary to be passed to the functions.\n\nCategories:\n - text-generation\n\nReferences:\n - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\nExamples:\n Generate without structured output (original implementation):\n\n ```python\n from distilabel.steps.tasks import ApiGenGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n apigen = ApiGenGenerator(\n use_default_structured_output=False,\n llm=llm\n )\n apigen.load()\n\n res = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\n What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', \"func_name\": \"getrandommovie\", \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\" } ] ) ) res # [{'examples': 'QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', # 'number': 1, # 'func_name': 'getrandommovie', # 'func_desc': 'Returns a list of random movies from a database by calling an external API.', # 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?', # 'Give me 5 random movie suggestions from your database to plan my weekend.'], # 'answers': [[{'name': 'getrandommovie', 'arguments': {}}], # [{'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}]], # 'raw_input_api_gen_generator_0': [{'role': 'system', # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format. Construct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date. Ensure the query: - Is clear and concise - Demonstrates typical use cases - Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words - Across a variety level of difficulties, ranging from beginner and advanced use cases - The corresponding result's parameter types and ranges match with the function's descriptions Ensure the answer: - Is a list of function calls in JSON format - The length of the answer list should be equal to the number of requests in the query - Can solve all the requests in the query effectively\"}, # {'role': 'user', # 'content': 'Here are examples of queries and the corresponding answers for similar functions: QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}] Note that the query could be interpreted as a combination of several independent requests. Based on these examples, generate 2 diverse query and answer pairs for the function getrandommovie The detailed function description is the following: Returns a list of random movies from a database by calling an external API. The output MUST strictly adhere to the following JSON format, and NO other text MUST be included: [\n {\n \"query\": \"The generated query.\",\n \"answers\": [\n {\n \"name\": \"api_name\",\n \"arguments\": {\n \"arg_name\": \"value\"\n ... (more arguments as required)\n }\n },\n ... (more API calls as required)\n ]\n }\n]\n Now please generate 2 diverse query and answer pairs following the above format.'}]}, # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] ``` Generate with structured output:\n\n ```python\n from distilabel.steps.tasks import ApiGenGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n apigen = ApiGenGenerator(\n use_default_structured_output=True,\n llm=llm\n )\n apigen.load()\n\n res_struct = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\n What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', \"func_name\": \"getrandommovie\", \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\" } ] ) ) res_struct # [{'examples': 'QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', # 'number': 1, # 'func_name': 'getrandommovie', # 'func_desc': 'Returns a list of random movies from a database by calling an external API.', # 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\", # \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"], # 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}], # [{'arguments': {}, 'name': 'getrandommovie'}]], # 'raw_input_api_gen_generator_0': [{'role': 'system', # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format. Construct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date. 
Ensure the query: - Is clear and concise - Demonstrates typical use cases - Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words - Across a variety level of difficulties, ranging from beginner and advanced use cases - The corresponding result's parameter types and ranges match with the function's descriptions Ensure the answer: - Is a list of function calls in JSON format - The length of the answer list should be equal to the number of requests in the query - Can solve all the requests in the query effectively\"}, # {'role': 'user', # 'content': 'Here are examples of queries and the corresponding answers for similar functions: QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}] Note that the query could be interpreted as a combination of several independent requests. Based on these examples, generate 2 diverse query and answer pairs for the function getrandommovie The detailed function description is the following: Returns a list of random movies from a database by calling an external API. Now please generate 2 diverse query and answer pairs following the above format.'}]}, # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] ``` Source code in src/distilabel/steps/tasks/apigen/generator.py class APIGenGenerator(Task):\n \"\"\"Generate queries and answers for the given functions in JSON format.\n\n The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\n verifiable and diverse function-calling datasets. The task generates a set of diverse queries\n and corresponding answers for the given functions in JSON format.\n\n Attributes:\n system_prompt: The system prompt to guide the user in the generation of queries and answers.\n use_tools: Whether to use the tools available in the prompt to generate the queries and answers.\n In case the tools are given in the input, they will be added to the prompt.\n number: The number of queries to generate. 
It can be a list, where each number will be\n chosen randomly, or a dictionary with the number of queries and the probability of each.\n I.e: `number=1`, `number=[1, 2, 3]`, `number={1: 0.5, 2: 0.3, 3: 0.2}` are all valid inputs.\n It corresponds to the number of parallel queries to generate.\n use_default_structured_output: Whether to use the default structured output or not.\n\n Input columns:\n - examples (`str`): Examples used as few shots to guide the model.\n - func_name (`str`): Name for the function to generate.\n - func_desc (`str`): Description of what the function should do.\n - tools (`str`): JSON formatted string containing the tool representation of the function.\n\n Output columns:\n - query (`str`): The list of queries.\n - answers (`str`): JSON formatted string with the list of answers, containing the info as\n a dictionary to be passed to the functions.\n\n Categories:\n - text-generation\n\n References:\n - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n Examples:\n Generate without structured output (original implementation):\n\n ```python\n from distilabel.steps.tasks import ApiGenGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n apigen = ApiGenGenerator(\n use_default_structured_output=False,\n llm=llm\n )\n apigen.load()\n\n res = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n \"func_name\": \"getrandommovie\",\n \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n }\n ]\n )\n )\n res\n # [{'examples': 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n # 'number': 1,\n # 'func_name': 'getrandommovie',\n # 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n # 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?',\n # 'Give me 5 random movie suggestions from your database to plan my weekend.'],\n # 'answers': [[{'name': 'getrandommovie', 'arguments': {}}],\n # [{'name': 'getrandommovie', 'arguments': {}},\n # {'name': 'getrandommovie', 'arguments': {}},\n # {'name': 'getrandommovie', 'arguments': {}},\n # {'name': 'getrandommovie', 'arguments': {}},\n # {'name': 'getrandommovie', 'arguments': {}}]],\n # 'raw_input_api_gen_generator_0': [{'role': 'system',\n # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\"},\n # {'role': 'user',\n # 'content': 'Here are examples of queries and the corresponding answers for similar functions:\\nQUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\\nThe detailed function description is the following:\\nReturns a list of random movies from a database by calling an external API.\\n\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n```json\\n[\\n {\\n \"query\": \"The generated query.\",\\n \"answers\": [\\n {\\n \"name\": \"api_name\",\\n \"arguments\": {\\n \"arg_name\": \"value\"\\n ... (more arguments as required)\\n }\\n },\\n ... (more API calls as required)\\n ]\\n }\\n]\\n```\\n\\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Generate with structured output:\n\n ```python\n from distilabel.steps.tasks import ApiGenGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n apigen = ApiGenGenerator(\n use_default_structured_output=True,\n llm=llm\n )\n apigen.load()\n\n res_struct = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n \"func_name\": \"getrandommovie\",\n \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n }\n ]\n )\n )\n res_struct\n # [{'examples': 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n # 'number': 1,\n # 'func_name': 'getrandommovie',\n # 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n # 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\",\n # \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"],\n # 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}],\n # [{'arguments': {}, 'name': 'getrandommovie'}]],\n # 'raw_input_api_gen_generator_0': [{'role': 'system',\n # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. 
For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\"},\n # {'role': 'user',\n # 'content': 'Here are examples of queries and the corresponding answers for similar functions:\\nQUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\\nThe detailed function description is the following:\\nReturns a list of random movies from a database by calling an external API.\\n\\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n \"\"\"\n\n system_prompt: str = SYSTEM_PROMPT_API_GEN\n use_default_structured_output: bool = False\n number: Union[int, List[int], Dict[int, float]] = 1\n use_tools: bool = True\n\n _number: Union[int, None] = PrivateAttr(None)\n _fn_parallel_queries: Union[Callable[[], str], None] = PrivateAttr(None)\n _format_inst: Union[str, None] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the template for the generator prompt.\"\"\"\n super().load()\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"apigen\"\n / \"generator.jinja2\"\n )\n self._template = Template(open(_path).read())\n self._format_inst = self._set_format_inst()\n\n def _parallel_queries(self, number: int) -> Callable[[int], str]:\n \"\"\"Prepares the function to update the parallel queries guide in the prompt.\n\n Raises:\n ValueError: if `is_parallel` is not a boolean or a list of floats.\n\n Returns:\n The function to generate the parallel queries guide.\n \"\"\"\n if number > 1:\n return (\n \"It can contain multiple parallel queries in natural language for the given functions. 
\"\n \"They could use either the same function with different arguments or different functions.\\n\"\n )\n return \"\"\n\n def _get_number(self) -> int:\n \"\"\"Generates the number of queries to generate in a single call.\n The number must be set to `_number` to avoid changing the original value\n when calling `_default_error`.\n \"\"\"\n if isinstance(self.number, list):\n self._number = random.choice(self.number)\n elif isinstance(self.number, dict):\n self._number = random.choices(\n list(self.number.keys()), list(self.number.values())\n )[0]\n else:\n self._number = self.number\n return self._number\n\n def _set_format_inst(self) -> str:\n \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n If the default structured output is used, returns an empty string because nothing\n else is needed, otherwise, returns the original addition to the prompt to guide the model\n to generate a formatted JSON.\n \"\"\"\n return (\n \"\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n\"\n \"```\\n\"\n \"[\\n\"\n \" {\\n\"\n ' \"query\": \"The generated query.\",\\n'\n ' \"answers\": [\\n'\n \" {\\n\"\n ' \"name\": \"api_name\",\\n'\n ' \"arguments\": {\\n'\n ' \"arg_name\": \"value\"\\n'\n \" ... (more arguments as required)\\n\"\n \" }\\n\"\n \" },\\n\"\n \" ... (more API calls as required)\\n\"\n \" ]\\n\"\n \" }\\n\"\n \"]\\n\"\n \"```\\n\"\n )\n\n def _get_func_desc(self, input: Dict[str, Any]) -> str:\n \"\"\"If available and required, will use the info from the tools in the\n prompt for extra information. Otherwise will use jut the function description.\n \"\"\"\n if not self.use_tools:\n return input[\"func_desc\"]\n extra = \"\" # Extra information from the tools (if available will be added)\n if \"tools\" in input:\n extra = f\"\\n\\nThis is the available tool to guide you (respect the order of the parameters):\\n{input['tools']}\"\n return input[\"func_desc\"] + extra\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task.\"\"\"\n return {\n \"examples\": True,\n \"func_name\": True,\n \"func_desc\": True,\n \"tools\": False,\n }\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType`.\"\"\"\n number = self._get_number()\n parallel_queries = self._parallel_queries(number)\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n examples=input[\"examples\"],\n parallel_queries=parallel_queries,\n number=number,\n func_name=input[\"func_name\"],\n func_desc=self._get_func_desc(input),\n format_inst=self._format_inst,\n ),\n },\n ]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The output for the task are the queries and corresponding answers.\"\"\"\n return [\"query\", \"answers\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the queries and answers pairs.\n The answers are an array of answers corresponding to the query.\n Each answer is represented as an object with the following properties:\n - name (string): The name of the tool used to generate the answer.\n - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n Each argument is represented as a key-value pair, where the key is the parameter name and the\n value is the corresponding value.\n \"\"\"\n if output is None:\n return self._default_error(input)\n\n if not self.use_default_structured_output:\n output = remove_fences(output)\n\n try:\n pairs = orjson.loads(output)\n except orjson.JSONDecodeError:\n return self._default_error(input)\n\n pairs = pairs[\"pairs\"] if self.use_default_structured_output else pairs\n\n return self._format_output(pairs, input)\n\n def _format_output(\n self, pairs: Dict[str, Any], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"Parses the response, returning a dictionary with queries and answers.\n\n Args:\n pairs: The parsed dictionary from the LLM's output.\n input: The input from the `LLM`.\n\n Returns:\n Formatted output, where the `queries` are a list of strings, and the `answers`\n are a list of objects.\n \"\"\"\n try:\n input.update(\n **{\n \"query\": pairs[0][\"query\"],\n \"answers\": json.dumps(pairs[0][\"answers\"]),\n }\n )\n return input\n except Exception as e:\n self._logger.error(f\"Error formatting output: {e}, pairs: '{pairs}'\")\n return self._default_error(input)\n\n def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"Returns a default error output, to fill the responses in case of failure.\"\"\"\n input.update(\n **{\n \"query\": None,\n \"answers\": json.dumps([None] * self._number),\n }\n )\n return input\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from typing import Dict, List\n from pydantic import BaseModel\n\n\n class Answer(BaseModel):\n name: str\n arguments: Dict[str, str]\n\n class QueryAnswer(BaseModel):\n query: str\n answers: List[Answer]\n\n class QueryAnswerPairs(BaseModel):\n pairs: List[QueryAnswer]\n\n json.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Answer\": {\n \"properties\": {\n \"name\": {\"title\": \"Name\", \"type\": \"string\"},\n \"arguments\": {\n \"additionalProperties\": {\"type\": \"string\"},\n \"title\": \"Arguments\",\n \"type\": \"object\",\n },\n },\n \"required\": [\"name\", \"arguments\"],\n \"title\": \"Answer\",\n \"type\": \"object\",\n },\n \"QueryAnswer\": {\n \"properties\": {\n \"query\": {\"title\": \"Query\", \"type\": \"string\"},\n \"answers\": {\n \"items\": {\"$ref\": \"#/$defs/Answer\"},\n \"title\": \"Answers\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"query\", \"answers\"],\n \"title\": \"QueryAnswer\",\n \"type\": \"object\",\n },\n },\n \"properties\": {\n \"pairs\": {\n \"items\": {\"$ref\": \"#/$defs/QueryAnswer\"},\n \"title\": \"Pairs\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"pairs\"],\n \"title\": \"QueryAnswerPairs\",\n \"type\": \"object\",\n }\n 
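As a usage note, the `query` column produced by `APIGenGenerator` is a plain string and the `answers` column is a JSON-encoded string (as described in the output columns above), so downstream code typically decodes `answers` before acting on the generated calls. The following is a minimal sketch of that decoding; the `row` contents are hypothetical and mirror the example output above, and how the calls are dispatched is left as a comment because it is not prescribed here:

```python
import json

# Hypothetical row, mirroring the example output above:
# `query` is a plain string and `answers` is a JSON-encoded string.
row = {
    "query": "I want to watch a movie tonight, can you recommend a random one from your database?",
    "answers": '[{"name": "getrandommovie", "arguments": {}}]',
}

# Decode the JSON string into a list of {"name": ..., "arguments": ...} dicts.
calls = json.loads(row["answers"])

for call in calls:
    # Dispatching is up to the user, e.g. looking `call["name"]` up in a
    # registry of Python callables; here we only print what would be called.
    print(f"Would call {call['name']} with arguments {call['arguments']}")
```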
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.outputs","title":"outputs: StepColumns property ","text":"The output for the task are the queries and corresponding answers. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.load","title":"load() ","text":"Loads the template for the generator prompt. Source code in src/distilabel/steps/tasks/apigen/generator.py def load(self) -> None:\n \"\"\"Loads the template for the generator prompt.\"\"\"\n super().load()\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"apigen\"\n / \"generator.jinja2\"\n )\n self._template = Template(open(_path).read())\n self._format_inst = self._set_format_inst()\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._parallel_queries","title":"_parallel_queries(number) ","text":"Prepares the function to update the parallel queries guide in the prompt. Raises: Type Description ValueError if is_parallel is not a boolean or a list of floats. Returns: Type Description Callable[[int], str] The function to generate the parallel queries guide. Source code in src/distilabel/steps/tasks/apigen/generator.py def _parallel_queries(self, number: int) -> Callable[[int], str]:\n \"\"\"Prepares the function to update the parallel queries guide in the prompt.\n\n Raises:\n ValueError: if `is_parallel` is not a boolean or a list of floats.\n\n Returns:\n The function to generate the parallel queries guide.\n \"\"\"\n if number > 1:\n return (\n \"It can contain multiple parallel queries in natural language for the given functions. \"\n \"They could use either the same function with different arguments or different functions.\\n\"\n )\n return \"\"\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._get_number","title":"_get_number() ","text":"Generates the number of queries to generate in a single call. The number must be set to _number to avoid changing the original value when calling _default_error . Source code in src/distilabel/steps/tasks/apigen/generator.py def _get_number(self) -> int:\n \"\"\"Generates the number of queries to generate in a single call.\n The number must be set to `_number` to avoid changing the original value\n when calling `_default_error`.\n \"\"\"\n if isinstance(self.number, list):\n self._number = random.choice(self.number)\n elif isinstance(self.number, dict):\n self._number = random.choices(\n list(self.number.keys()), list(self.number.values())\n )[0]\n else:\n self._number = self.number\n return self._number\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._set_format_inst","title":"_set_format_inst() ","text":"Prepares the function to generate the formatted instructions for the prompt. If the default structured output is used, returns an empty string because nothing else is needed, otherwise, returns the original addition to the prompt to guide the model to generate a formatted JSON. 
Source code in src/distilabel/steps/tasks/apigen/generator.py def _set_format_inst(self) -> str:\n \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n If the default structured output is used, returns an empty string because nothing\n else is needed, otherwise, returns the original addition to the prompt to guide the model\n to generate a formatted JSON.\n \"\"\"\n return (\n \"\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n\"\n \"```\\n\"\n \"[\\n\"\n \" {\\n\"\n ' \"query\": \"The generated query.\",\\n'\n ' \"answers\": [\\n'\n \" {\\n\"\n ' \"name\": \"api_name\",\\n'\n ' \"arguments\": {\\n'\n ' \"arg_name\": \"value\"\\n'\n \" ... (more arguments as required)\\n\"\n \" }\\n\"\n \" },\\n\"\n \" ... (more API calls as required)\\n\"\n \" ]\\n\"\n \" }\\n\"\n \"]\\n\"\n \"```\\n\"\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._get_func_desc","title":"_get_func_desc(input) ","text":"If available and required, will use the info from the tools in the prompt for extra information. Otherwise will use jut the function description. Source code in src/distilabel/steps/tasks/apigen/generator.py def _get_func_desc(self, input: Dict[str, Any]) -> str:\n \"\"\"If available and required, will use the info from the tools in the\n prompt for extra information. Otherwise will use jut the function description.\n \"\"\"\n if not self.use_tools:\n return input[\"func_desc\"]\n extra = \"\" # Extra information from the tools (if available will be added)\n if \"tools\" in input:\n extra = f\"\\n\\nThis is the available tool to guide you (respect the order of the parameters):\\n{input['tools']}\"\n return input[\"func_desc\"] + extra\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType . Source code in src/distilabel/steps/tasks/apigen/generator.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType`.\"\"\"\n number = self._get_number()\n parallel_queries = self._parallel_queries(number)\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n examples=input[\"examples\"],\n parallel_queries=parallel_queries,\n number=number,\n func_name=input[\"func_name\"],\n func_desc=self._get_func_desc(input),\n format_inst=self._format_inst,\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.format_output","title":"format_output(output, input) ","text":"The output is formatted as a list with the score of each instruction. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the queries and answers pairs. Dict[str, Any] The answers are an array of answers corresponding to the query. Dict[str, Any] Each answer is represented as an object with the following properties: - name (string): The name of the tool used to generate the answer. - arguments (object): An object representing the arguments passed to the tool to generate the answer. Dict[str, Any] Each argument is represented as a key-value pair, where the key is the parameter name and the Dict[str, Any] value is the corresponding value. 
Source code in src/distilabel/steps/tasks/apigen/generator.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the queries and answers pairs.\n The answers are an array of answers corresponding to the query.\n Each answer is represented as an object with the following properties:\n - name (string): The name of the tool used to generate the answer.\n - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n Each argument is represented as a key-value pair, where the key is the parameter name and the\n value is the corresponding value.\n \"\"\"\n if output is None:\n return self._default_error(input)\n\n if not self.use_default_structured_output:\n output = remove_fences(output)\n\n try:\n pairs = orjson.loads(output)\n except orjson.JSONDecodeError:\n return self._default_error(input)\n\n pairs = pairs[\"pairs\"] if self.use_default_structured_output else pairs\n\n return self._format_output(pairs, input)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._format_output","title":"_format_output(pairs, input) ","text":"Parses the response, returning a dictionary with queries and answers. Parameters: Name Type Description Default pairs Dict[str, Any] The parsed dictionary from the LLM's output. required input Dict[str, Any] The input from the LLM . required Returns: Type Description Dict[str, Any] Formatted output, where the queries are a list of strings, and the answers Dict[str, Any] are a list of objects. Source code in src/distilabel/steps/tasks/apigen/generator.py def _format_output(\n self, pairs: Dict[str, Any], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"Parses the response, returning a dictionary with queries and answers.\n\n Args:\n pairs: The parsed dictionary from the LLM's output.\n input: The input from the `LLM`.\n\n Returns:\n Formatted output, where the `queries` are a list of strings, and the `answers`\n are a list of objects.\n \"\"\"\n try:\n input.update(\n **{\n \"query\": pairs[0][\"query\"],\n \"answers\": json.dumps(pairs[0][\"answers\"]),\n }\n )\n return input\n except Exception as e:\n self._logger.error(f\"Error formatting output: {e}, pairs: '{pairs}'\")\n return self._default_error(input)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._default_error","title":"_default_error(input) ","text":"Returns a default error output, to fill the responses in case of failure. Source code in src/distilabel/steps/tasks/apigen/generator.py def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"Returns a default error output, to fill the responses in case of failure.\"\"\"\n input.update(\n **{\n \"query\": None,\n \"answers\": json.dumps([None] * self._number),\n }\n )\n return input\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. 
The schema corresponds to the following: from typing import Dict, List\nfrom pydantic import BaseModel\n\n\nclass Answer(BaseModel):\n name: str\n arguments: Dict[str, str]\n\nclass QueryAnswer(BaseModel):\n query: str\n answers: List[Answer]\n\nclass QueryAnswerPairs(BaseModel):\n pairs: List[QueryAnswer]\n\njson.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/apigen/generator.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from typing import Dict, List\n from pydantic import BaseModel\n\n\n class Answer(BaseModel):\n name: str\n arguments: Dict[str, str]\n\n class QueryAnswer(BaseModel):\n query: str\n answers: List[Answer]\n\n class QueryAnswerPairs(BaseModel):\n pairs: List[QueryAnswer]\n\n json.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Answer\": {\n \"properties\": {\n \"name\": {\"title\": \"Name\", \"type\": \"string\"},\n \"arguments\": {\n \"additionalProperties\": {\"type\": \"string\"},\n \"title\": \"Arguments\",\n \"type\": \"object\",\n },\n },\n \"required\": [\"name\", \"arguments\"],\n \"title\": \"Answer\",\n \"type\": \"object\",\n },\n \"QueryAnswer\": {\n \"properties\": {\n \"query\": {\"title\": \"Query\", \"type\": \"string\"},\n \"answers\": {\n \"items\": {\"$ref\": \"#/$defs/Answer\"},\n \"title\": \"Answers\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"query\", \"answers\"],\n \"title\": \"QueryAnswer\",\n \"type\": \"object\",\n },\n },\n \"properties\": {\n \"pairs\": {\n \"items\": {\"$ref\": \"#/$defs/QueryAnswer\"},\n \"title\": \"Pairs\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"pairs\"],\n \"title\": \"QueryAnswerPairs\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker","title":"APIGenSemanticChecker ","text":" Bases: Task Generate queries and answers for the given functions in JSON format. The APIGenGenerator is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task generates a set of diverse queries and corresponding answers for the given functions in JSON format. Attributes: Name Type Description system_prompt str System prompt for the task. Has a default one. exclude_failed_execution str Whether to exclude failed executions (won't run on those rows that have a False in keep_row_after_execution_check column, which comes from running APIGenExecutionChecker ). Defaults to True. Input columns - func_desc (
str ): Description of what the function should do. - query (
str ): Instruction from the user. - answers (
str ): JSON encoded list with arguments to be passed to the function/API. Should be loaded using json.loads . - execution_result (
str ): Result of the function/API executed. Output columns - thought (
str ): Reasoning behind the decision on whether to keep this output or not. - keep_row_after_semantic_check (
bool ): True or False; can be used to filter rows afterwards (see the filtering sketch below). Categories - filtering
- text-generation
References - APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets
- Salesforce/xlam-function-calling-60k
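Because `keep_row_after_semantic_check` is intended for filtering, a common follow-up once the task has run is to drop the rows that did not pass. The snippet below is a minimal sketch under the assumption that the task outputs have already been collected into a list of dictionaries; the `rows` variable is hypothetical and, in a real pipeline, the rows would come from the resulting distiset:

```python
# Hypothetical output rows from APIGenSemanticChecker; in a real pipeline these
# would come from the resulting distiset rather than being written by hand.
rows = [
    {"query": "What is the weather in Paris?", "keep_row_after_semantic_check": True},
    {"query": "Book me a flight.", "keep_row_after_semantic_check": False},
]

# Keep only the rows that passed the semantic check. `None` values (which the
# task sets when the model output could not be parsed) are treated as failures.
kept = [row for row in rows if row.get("keep_row_after_semantic_check") is True]

print(f"Kept {len(kept)} of {len(rows)} rows")
```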
Examples: Semantic checker for generated function calls (original implementation):\n\n```python\nfrom distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\nsemantic_checker = APIGenSemanticChecker(\n use_default_structured_output=False,\n llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'thought': '',\n# 'keep_row_after_semantic_check': True,\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n# 'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n# {'role': 'user',\n# 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. 
Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n \"thought\": \"Concisely describe your reasoning here\",\\n \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n```\n\nSemantic checker for generated function calls (structured output):\n\n```python\nfrom distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\nsemantic_checker = APIGenSemanticChecker(\n use_default_structured_output=True,\n llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'keep_row_after_semantic_check': True,\n# 'thought': '',\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n# 'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n# {'role': 'user',\n# 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. 
Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n```\n Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py class APIGenSemanticChecker(Task):\n r\"\"\"Generate queries and answers for the given functions in JSON format.\n\n The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\n verifiable and diverse function-calling datasets. The task generates a set of diverse queries\n and corresponding answers for the given functions in JSON format.\n\n Attributes:\n system_prompt: System prompt for the task. Has a default one.\n exclude_failed_execution: Whether to exclude failed executions (won't run on those\n rows that have a False in `keep_row_after_execution_check` column, which\n comes from running `APIGenExecutionChecker`). Defaults to True.\n\n Input columns:\n - func_desc (`str`): Description of what the function should do.\n - query (`str`): Instruction from the user.\n - answers (`str`): JSON encoded list with arguments to be passed to the function/API.\n Should be loaded using `json.loads`.\n - execution_result (`str`): Result of the function/API executed.\n\n Output columns:\n - thought (`str`): Reasoning for the output on whether to keep this output or not.\n - keep_row_after_semantic_check (`bool`): True or False, can be used to filter\n afterwards.\n\n Categories:\n - filtering\n - text-generation\n\n References:\n - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n Examples:\n\n Semantic checker for generated function calls (original implementation):\n\n ```python\n from distilabel.steps.tasks import APIGenSemanticChecker\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n semantic_checker = APIGenSemanticChecker(\n use_default_structured_output=False,\n llm=llm\n )\n semantic_checker.load()\n\n res = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n )\n res\n # [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n # 'query': 'What information can be obtained about the Maine Coon cat breed?',\n # 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n # 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n # 'thought': '',\n # 'keep_row_after_semantic_check': True,\n # 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 
'system',\n # 'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n # {'role': 'user',\n # 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n \"thought\": \"Concisely describe your reasoning here\",\\n \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Semantic checker for generated function calls (structured output):\n\n ```python\n from distilabel.steps.tasks import APIGenSemanticChecker\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n semantic_checker = APIGenSemanticChecker(\n use_default_structured_output=True,\n llm=llm\n )\n semantic_checker.load()\n\n res = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n )\n res\n # [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n # 'query': 'What information can be obtained about the Maine Coon cat breed?',\n # 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n # 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n # 'keep_row_after_semantic_check': True,\n # 'thought': '',\n # 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n # 'content': 'As a data quality evaluator, you must assess the alignment 
between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n # {'role': 'user',\n # 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n \"\"\"\n\n system_prompt: str = SYSTEM_PROMPT_SEMANTIC_CHECKER\n use_default_structured_output: bool = False\n\n _format_inst: Union[str, None] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the template for the generator prompt.\"\"\"\n super().load()\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"apigen\"\n / \"semantic_checker.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n self._format_inst = self._set_format_inst()\n\n def _set_format_inst(self) -> str:\n \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n If the default structured output is used, returns an empty string because nothing\n else is needed, otherwise, returns the original addition to the prompt to guide the model\n to generate a formatted JSON.\n \"\"\"\n return (\n \"\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n\"\n \"```\\n\"\n \"{\\n\"\n ' \"thought\": \"Concisely describe your reasoning here\",\\n'\n ' \"passes\": \"yes\" or \"no\"\\n'\n \"}\\n\"\n \"```\\n\"\n )\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task.\"\"\"\n return {\n \"func_desc\": True,\n \"query\": True,\n \"answers\": True,\n \"execution_result\": True,\n \"keep_row_after_execution_check\": True,\n }\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType`.\"\"\"\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n func_desc=input[\"func_desc\"],\n query=input[\"query\"] or \"\",\n func_call=input[\"answers\"] or \"\",\n 
execution_result=input[\"execution_result\"],\n format_inst=self._format_inst,\n ),\n },\n ]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The output for the task are the queries and corresponding answers.\"\"\"\n return [\"keep_row_after_semantic_check\", \"thought\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the queries and answers pairs.\n The answers are an array of answers corresponding to the query.\n Each answer is represented as an object with the following properties:\n - name (string): The name of the tool used to generate the answer.\n - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n Each argument is represented as a key-value pair, where the key is the parameter name and the\n value is the corresponding value.\n \"\"\"\n if output is None:\n return self._default_error(input)\n\n output = remove_fences(output)\n\n try:\n result = orjson.loads(output)\n # Update the column name and change to bool\n result[\"keep_row_after_semantic_check\"] = (\n result.pop(\"passes\").lower() == \"yes\"\n )\n input.update(**result)\n return input\n except orjson.JSONDecodeError:\n return self._default_error(input)\n\n def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"Default error message for the task.\"\"\"\n input.update({\"thought\": None, \"keep_row_after_semantic_check\": None})\n return input\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from typing import Literal\n from pydantic import BaseModel\n import json\n\n class Checker(BaseModel):\n thought: str\n passes: Literal[\"yes\", \"no\"]\n\n json.dumps(Checker.model_json_schema(), indent=4)\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"thought\": {\"title\": \"Thought\", \"type\": \"string\"},\n \"passes\": {\"enum\": [\"yes\", \"no\"], \"title\": \"Passes\", \"type\": \"string\"},\n },\n \"required\": [\"thought\", \"passes\"],\n \"title\": \"Checker\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.outputs","title":"outputs: StepColumns property ","text":"The output for the task are the queries and corresponding answers. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.load","title":"load() ","text":"Loads the template for the generator prompt. 
Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py def load(self) -> None:\n \"\"\"Loads the template for the generator prompt.\"\"\"\n super().load()\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"apigen\"\n / \"semantic_checker.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n self._format_inst = self._set_format_inst()\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker._set_format_inst","title":"_set_format_inst() ","text":"Prepares the function to generate the formatted instructions for the prompt. If the default structured output is used, returns an empty string because nothing else is needed, otherwise, returns the original addition to the prompt to guide the model to generate a formatted JSON. Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py def _set_format_inst(self) -> str:\n \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n If the default structured output is used, returns an empty string because nothing\n else is needed, otherwise, returns the original addition to the prompt to guide the model\n to generate a formatted JSON.\n \"\"\"\n return (\n \"\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n\"\n \"```\\n\"\n \"{\\n\"\n ' \"thought\": \"Concisely describe your reasoning here\",\\n'\n ' \"passes\": \"yes\" or \"no\"\\n'\n \"}\\n\"\n \"```\\n\"\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType . Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType`.\"\"\"\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n func_desc=input[\"func_desc\"],\n query=input[\"query\"] or \"\",\n func_call=input[\"answers\"] or \"\",\n execution_result=input[\"execution_result\"],\n format_inst=self._format_inst,\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.format_output","title":"format_output(output, input) ","text":"The output is formatted as a list with the score of each instruction. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the queries and answers pairs. Dict[str, Any] The answers are an array of answers corresponding to the query. Dict[str, Any] Each answer is represented as an object with the following properties: - name (string): The name of the tool used to generate the answer. - arguments (object): An object representing the arguments passed to the tool to generate the answer. Dict[str, Any] Each argument is represented as a key-value pair, where the key is the parameter name and the Dict[str, Any] value is the corresponding value. Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the queries and answers pairs.\n The answers are an array of answers corresponding to the query.\n Each answer is represented as an object with the following properties:\n - name (string): The name of the tool used to generate the answer.\n - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n Each argument is represented as a key-value pair, where the key is the parameter name and the\n value is the corresponding value.\n \"\"\"\n if output is None:\n return self._default_error(input)\n\n output = remove_fences(output)\n\n try:\n result = orjson.loads(output)\n # Update the column name and change to bool\n result[\"keep_row_after_semantic_check\"] = (\n result.pop(\"passes\").lower() == \"yes\"\n )\n input.update(**result)\n return input\n except orjson.JSONDecodeError:\n return self._default_error(input)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker._default_error","title":"_default_error(input) ","text":"Default error message for the task. Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"Default error message for the task.\"\"\"\n input.update({\"thought\": None, \"keep_row_after_semantic_check\": None})\n return input\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from typing import Literal\nfrom pydantic import BaseModel\nimport json\n\nclass Checker(BaseModel):\n thought: str\n passes: Literal[\"yes\", \"no\"]\n\njson.dumps(Checker.model_json_schema(), indent=4)\n Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from typing import Literal\n from pydantic import BaseModel\n import json\n\n class Checker(BaseModel):\n thought: str\n passes: Literal[\"yes\", \"no\"]\n\n json.dumps(Checker.model_json_schema(), indent=4)\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"thought\": {\"title\": \"Thought\", \"type\": \"string\"},\n \"passes\": {\"enum\": [\"yes\", \"no\"], \"title\": \"Passes\", \"type\": \"string\"},\n },\n \"required\": [\"thought\", \"passes\"],\n \"title\": \"Checker\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller","title":"ArgillaLabeller ","text":" Bases: Task Annotate Argilla records based on input fields, example records and question settings. This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM. It uses a system prompt that guides the LLM to understand the input fields, the question type, and the question settings. The task then formats the input data and generates a response based on the question. 
The response is validated against the question's value model, and the final suggestion is prepared for annotation. Attributes: Name Type Description _template Union[Template, None] a Jinja2 template used to format the input for the LLM. Input columns - record (
argilla.Record ): The record to be annotated. - fields (
Optional[List[Dict[str, Any]]] ): The list of field settings for the input fields. - question (
Optional[Dict[str, Any]] ): The question settings for the question to be answered. - example_records (
Optional[List[Dict[str, Any]]] ): The few-shot example records with responses to be used to answer the question. - guidelines (
Optional[str] ): The guidelines for the annotation task. Output columns - suggestion (
Dict[str, Any] ): The final suggestion for annotation. Categories - text-classification
- scorer
- text-generation
References Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets Examples: Annotate a record with the same dataset and question: import argilla as rg\nfrom argilla import Suggestion\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\npending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\ncompleted_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\npending_records = list(\n dataset.records(\n query=rg.Query(filter=pending_records_filter),\n limit=5,\n )\n)\nexample_records = list(\n dataset.records(\n query=rg.Query(filter=completed_records_filter),\n limit=5,\n )\n)\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n fields=[field],\n question=question,\n example_records=example_records,\n guidelines=dataset.guidelines\n)\nlabeller.load()\n\n# Process the pending records\nresult = next(\n labeller.process(\n [\n {\n \"record\": record\n } for record in pending_records\n ]\n )\n)\n\n# Add the suggestions to the records\nfor record, suggestion in zip(pending_records, result):\n record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated records\ndataset.records.log(pending_records)\n Annotate a record with alternating datasets and questions: import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\nquestion2 = dataset.settings.questions[\"label2\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\nlabeller.load()\n\n# Process the record\nrecord = next(dataset.records())\nresult = next(\n labeller.process(\n [\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question,\n },\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question2,\n }\n ]\n )\n)\n\n# Add the suggestions to the record\nfor suggestion in result:\n record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated record\ndataset.records.log([record])\n Overwrite default prompts and instructions: import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Overwrite default prompts and instructions\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n question_to_label_instruction={\n \"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n \"multi_label_selection\": \"Select none, one or multiple labels from the list of provided labels.\",\n \"text\": \"Provide a text response to the question.\",\n \"rating\": \"Provide a rating for the question.\",\n },\n)\nlabeller.load()\n Source code in src/distilabel/steps/tasks/argilla_labeller.py class 
ArgillaLabeller(Task):\n \"\"\"\n Annotate Argilla records based on input fields, example records and question settings.\n\n This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM.\n It uses a system prompt that guides the LLM to understand the input fields, the question type,\n and the question settings. The task then formats the input data and generates a response based on the question.\n The response is validated against the question's value model, and the final suggestion is prepared for annotation.\n\n Attributes:\n _template: a Jinja2 template used to format the input for the LLM.\n\n Input columns:\n - record (`argilla.Record`): The record to be annotated.\n - fields (`Optional[List[Dict[str, Any]]]`): The list of field settings for the input fields.\n - question (`Optional[Dict[str, Any]]`): The question settings for the question to be answered.\n - example_records (`Optional[List[Dict[str, Any]]]`): The few shot example records with responses to be used to answer the question.\n - guidelines (`Optional[str]`): The guidelines for the annotation task.\n\n Output columns:\n - suggestion (`Dict[str, Any]`): The final suggestion for annotation.\n\n Categories:\n - text-classification\n - scorer\n - text-generation\n\n References:\n - [`Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets`](https://github.com/argilla-io/argilla/)\n\n Examples:\n Annotate a record with the same dataset and question:\n\n ```python\n import argilla as rg\n from argilla import Suggestion\n from distilabel.steps.tasks import ArgillaLabeller\n from distilabel.models import InferenceEndpointsLLM\n\n # Get information from Argilla dataset definition\n dataset = rg.Dataset(\"my_dataset\")\n pending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\n completed_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\n pending_records = list(\n dataset.records(\n query=rg.Query(filter=pending_records_filter),\n limit=5,\n )\n )\n example_records = list(\n dataset.records(\n query=rg.Query(filter=completed_records_filter),\n limit=5,\n )\n )\n field = dataset.settings.fields[\"text\"]\n question = dataset.settings.questions[\"label\"]\n\n # Initialize the labeller with the model and fields\n labeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n fields=[field],\n question=question,\n example_records=example_records,\n guidelines=dataset.guidelines\n )\n labeller.load()\n\n # Process the pending records\n result = next(\n labeller.process(\n [\n {\n \"record\": record\n } for record in pending_records\n ]\n )\n )\n\n # Add the suggestions to the records\n for record, suggestion in zip(pending_records, result):\n record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n # Log the updated records\n dataset.records.log(pending_records)\n ```\n\n Annotate a record with alternating datasets and questions:\n\n ```python\n import argilla as rg\n from distilabel.steps.tasks import ArgillaLabeller\n from distilabel.models import InferenceEndpointsLLM\n\n # Get information from Argilla dataset definition\n dataset = rg.Dataset(\"my_dataset\")\n field = dataset.settings.fields[\"text\"]\n question = dataset.settings.questions[\"label\"]\n question2 = dataset.settings.questions[\"label2\"]\n\n # Initialize the labeller with the model and fields\n labeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n 
model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n )\n labeller.load()\n\n # Process the record\n record = next(dataset.records())\n result = next(\n labeller.process(\n [\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question,\n },\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question2,\n }\n ]\n )\n )\n\n # Add the suggestions to the record\n for suggestion in result:\n record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n # Log the updated record\n dataset.records.log([record])\n ```\n\n Overwrite default prompts and instructions:\n\n ```python\n import argilla as rg\n from distilabel.steps.tasks import ArgillaLabeller\n from distilabel.models import InferenceEndpointsLLM\n\n # Overwrite default prompts and instructions\n labeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n question_to_label_instruction={\n \"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n \"multi_label_selection\": \"Select none, one or multiple labels from the list of provided labels.\",\n \"text\": \"Provide a text response to the question.\",\n \"rating\": \"Provide a rating for the question.\",\n },\n )\n labeller.load()\n ```\n \"\"\"\n\n system_prompt: str = (\n \"You are an expert annotator and labelling assistant that understands complex domains and natural language processing. \"\n \"You are given input fields and a question. \"\n \"You should create a valid JSON object as an response to the question based on the input fields. \"\n )\n question_to_label_instruction: Dict[str, str] = {\n \"label_selection\": \"Select the appropriate label for the fields from the list of optional labels.\",\n \"multi_label_selection\": \"Select none, one or multiple labels for the fields from the list of optional labels.\",\n \"text\": \"Provide a response to the question based on the fields.\",\n \"rating\": \"Provide a rating for the question based on the fields.\",\n }\n example_records: Optional[\n RuntimeParameter[Union[List[Union[Dict[str, Any], BaseModel]], None]]\n ] = Field(\n default=None,\n description=\"The few shot serialized example records or `BaseModel`s with responses to be used to answer the question.\",\n )\n fields: Optional[\n RuntimeParameter[Union[List[Union[BaseModel, Dict[str, Any]]], None]]\n ] = Field(\n default=None,\n description=\"The field serialized field settings or `BaseModel` for the fields to be used to answer the question.\",\n )\n question: Optional[\n RuntimeParameter[\n Union[\n Dict[str, Any],\n BaseModel,\n None,\n ]\n ]\n ] = Field(\n default=None,\n description=\"The question serialized question settings or `BaseModel` for the question to be answered.\",\n )\n guidelines: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The guidelines for the annotation task.\",\n )\n\n _template: Union[Template, None] = PrivateAttr(...)\n _client: Optional[Any] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"argillalabeller.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> Dict[str, bool]:\n return {\n \"record\": True,\n \"fields\": False,\n \"question\": False,\n 
\"example_records\": False,\n \"guidelines\": False,\n }\n\n def _format_record(\n self, record: Dict[str, Any], fields: List[Dict[str, Any]]\n ) -> str:\n \"\"\"Format the record fields into a string.\n\n Args:\n record (Dict[str, Any]): The record to format.\n fields (List[Dict[str, Any]]): The fields to format.\n\n Returns:\n str: The formatted record fields.\n \"\"\"\n output = []\n for field in fields:\n output.append(record.get(\"fields\", {}).get(field.get(\"name\", \"\")))\n return \"fields: \" + \"\\n\".join(output)\n\n def _get_label_instruction(self, question: Dict[str, Any]) -> str:\n \"\"\"Get the label instruction for the question.\n\n Args:\n question (Dict[str, Any]): The question to get the label instruction for.\n\n Returns:\n str: The label instruction for the question.\n \"\"\"\n question_type = question[\"settings\"][\"type\"]\n return self.question_to_label_instruction[question_type]\n\n def _format_question(self, question: Dict[str, Any]) -> str:\n \"\"\"Format the question settings into a string.\n\n Args:\n question (Dict[str, Any]): The question to format.\n\n Returns:\n str: The formatted question.\n \"\"\"\n output = []\n output.append(f\"question: {self._get_label_instruction(question)}\")\n if \"options\" in question.get(\"settings\", {}):\n output.append(\n f\"optional labels: {[option['value'] for option in question.get('settings', {}).get('options', [])]}\"\n )\n return \"\\n\".join(output)\n\n def _format_example_records(\n self,\n records: List[Dict[str, Any]],\n fields: List[Dict[str, Any]],\n question: Dict[str, Any],\n ) -> str:\n \"\"\"Format the example records into a string.\n\n Args:\n records (List[Dict[str, Any]]): The records to format.\n fields (List[Dict[str, Any]]): The fields to format.\n question (Dict[str, Any]): The question to format.\n\n Returns:\n str: The formatted example records.\n \"\"\"\n base = []\n for record in records:\n responses = record.get(\"responses\", {})\n if responses.get(question[\"name\"]):\n base.append(self._format_record(record, fields))\n value = responses[question[\"name\"]][0][\"value\"]\n formatted_value = self._assign_value_to_question_value_model(\n value, question\n )\n base.append(f\"response: {formatted_value}\")\n base.append(\"\")\n else:\n warnings.warn(\n f\"Record {record} has no response for question {question['name']}. 
Skipping example record.\",\n stacklevel=2,\n )\n return \"\\n\".join(base)\n\n def format_input(\n self,\n input: Dict[\n str,\n Union[\n Dict[str, Any],\n \"Record\",\n \"TextField\",\n \"MultiLabelQuestion\",\n \"LabelQuestion\",\n \"RatingQuestion\",\n \"TextQuestion\",\n ],\n ],\n ) -> \"ChatType\":\n \"\"\"Format the input into a chat message.\n\n Args:\n input: The input to format.\n\n Returns:\n The formatted chat message.\n\n Raises:\n ValueError: If question or fields are not provided.\n \"\"\"\n input_keys = list(self.inputs.keys())\n record = input[input_keys[0]]\n fields = input.get(input_keys[1], self.fields)\n question = input.get(input_keys[2], self.question)\n examples = input.get(input_keys[3], self.example_records)\n guidelines = input.get(input_keys[4], self.guidelines)\n\n if question is None:\n raise ValueError(\"Question must be provided.\")\n if fields is None or any(field is None for field in fields):\n raise ValueError(\"Fields must be provided.\")\n\n record = record.to_dict() if not isinstance(record, dict) else record\n question = question.serialize() if not isinstance(question, dict) else question\n fields = [\n field.serialize() if not isinstance(field, dict) else field\n for field in fields\n ]\n examples = (\n [\n example.to_dict() if not isinstance(example, dict) else example\n for example in examples\n ]\n if examples\n else None\n )\n\n formatted_fields = self._format_record(record, fields)\n formatted_question = self._format_question(question)\n formatted_examples = (\n self._format_example_records(examples, fields, question)\n if examples\n else False\n )\n\n prompt = self._template.render(\n fields=formatted_fields,\n question=formatted_question,\n examples=formatted_examples,\n guidelines=guidelines,\n )\n\n messages = []\n if self.system_prompt:\n messages.append({\"role\": \"system\", \"content\": self.system_prompt})\n messages.append({\"role\": \"user\", \"content\": prompt})\n return messages\n\n @property\n def outputs(self) -> List[str]:\n return [\"suggestion\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"Format the output into a dictionary.\n\n Args:\n output (Union[str, None]): The output to format.\n input (Dict[str, Any]): The input to format.\n\n Returns:\n Dict[str, Any]: The formatted output.\n \"\"\"\n from argilla import Suggestion\n\n question: Union[\n Any,\n Dict[str, Any],\n LabelQuestion,\n MultiLabelQuestion,\n RatingQuestion,\n TextQuestion,\n None,\n ] = input.get(list(self.inputs.keys())[2], self.question) or self.question\n question = question.serialize() if not isinstance(question, dict) else question\n model = self._get_pydantic_model_of_structured_output(question)\n validated_output = model(**json.loads(output))\n value = self._get_value_from_question_value_model(validated_output)\n suggestion = Suggestion(\n value=value,\n question_name=question[\"name\"],\n type=\"model\",\n agent=self.llm.model_name,\n ).serialize()\n return {\n self.outputs[0]: {\n k: v\n for k, v in suggestion.items()\n if k in [\"value\", \"question_name\", \"type\", \"agent\"]\n }\n }\n\n def _set_llm_structured_output_for_question(self, question: Dict[str, Any]) -> None:\n runtime_parameters = self.llm._runtime_parameters\n runtime_parameters.update(\n {\n \"structured_output\": {\n \"format\": \"json\",\n \"schema\": self._get_pydantic_model_of_structured_output(question),\n },\n }\n )\n self.llm.set_runtime_parameters(runtime_parameters)\n\n @override\n def process(self, inputs: 
StepInput) -> \"StepOutput\":\n \"\"\"Process the input through the task.\n\n Args:\n inputs (StepInput): The input to process.\n\n Returns:\n StepOutput: The output of the task.\n \"\"\"\n\n question_list = [input.get(\"question\", self.question) for input in inputs]\n fields_list = [input.get(\"fields\", self.fields) for input in inputs]\n # check if any field for the field in fields is None\n for fields in fields_list:\n if any(field is None for field in fields):\n raise ValueError(\n \"Fields must be provided during init or through `process` method.\"\n )\n # check if any question is None\n if any(question is None for question in question_list):\n raise ValueError(\n \"Question must be provided during init or through `process` method.\"\n )\n question_list = [\n question.serialize() if not isinstance(question, dict) else question\n for question in question_list\n ]\n if not all(question == question_list[0] for question in question_list):\n warnings.warn(\n \"Not all questions are the same. Processing each question separately by setting the structured output for each question. This may impact performance.\",\n stacklevel=2,\n )\n for input, question in zip(inputs, question_list):\n self._set_llm_structured_output_for_question(question)\n yield from super().process([input])\n else:\n question = question_list[0]\n self._set_llm_structured_output_for_question(question)\n yield from super().process(inputs)\n\n def _get_value_from_question_value_model(\n self, question_value_model: BaseModel\n ) -> Any:\n \"\"\"Get the value from the question value model.\n\n Args:\n question_value_model (BaseModel): The question value model to get the value from.\n\n Returns:\n Any: The value from the question value model.\n \"\"\"\n for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n if hasattr(question_value_model, attr):\n return getattr(question_value_model, attr)\n raise ValueError(f\"Unsupported question type: {question_value_model}\")\n\n def _assign_value_to_question_value_model(\n self, value: Any, question: Dict[str, Any]\n ) -> BaseModel:\n \"\"\"Assign the value to the question value model.\n\n Args:\n value (Any): The value to assign.\n question (Dict[str, Any]): The question to assign the value to.\n\n Returns:\n BaseModel: The question value model with the assigned value.\n \"\"\"\n question_value_model = self._get_pydantic_model_of_structured_output(question)\n for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n try:\n model_dict = {attr: value}\n question_value_model = question_value_model(**model_dict)\n return question_value_model.model_dump_json()\n except AttributeError:\n pass\n return value\n\n def _get_pydantic_model_of_structured_output(\n self,\n question: Dict[str, Any],\n ) -> BaseModel:\n \"\"\"Get the Pydantic model of the structured output.\n\n Args:\n question (Dict[str, Any]): The question to get the Pydantic model of the structured output for.\n\n Returns:\n BaseModel: The Pydantic model of the structured output.\n \"\"\"\n\n question_type = question[\"settings\"][\"type\"]\n\n if question_type == \"multi_label_selection\":\n\n class QuestionValueModel(BaseModel):\n labels: Optional[List[str]] = Field(default_factory=list)\n\n elif question_type == \"label_selection\":\n\n class QuestionValueModel(BaseModel):\n label: str\n\n elif question_type == \"text\":\n\n class QuestionValueModel(BaseModel):\n text: str\n\n elif question_type == \"rating\":\n\n class QuestionValueModel(BaseModel):\n rating: int\n else:\n raise ValueError(f\"Unsupported question type: 
{question}\")\n\n return QuestionValueModel\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/argilla_labeller.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"argillalabeller.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_record","title":"_format_record(record, fields) ","text":"Format the record fields into a string. Parameters: Name Type Description Default record Dict[str, Any] The record to format. required fields List[Dict[str, Any]] The fields to format. required Returns: Name Type Description str str The formatted record fields. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _format_record(\n self, record: Dict[str, Any], fields: List[Dict[str, Any]]\n) -> str:\n \"\"\"Format the record fields into a string.\n\n Args:\n record (Dict[str, Any]): The record to format.\n fields (List[Dict[str, Any]]): The fields to format.\n\n Returns:\n str: The formatted record fields.\n \"\"\"\n output = []\n for field in fields:\n output.append(record.get(\"fields\", {}).get(field.get(\"name\", \"\")))\n return \"fields: \" + \"\\n\".join(output)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_label_instruction","title":"_get_label_instruction(question) ","text":"Get the label instruction for the question. Parameters: Name Type Description Default question Dict[str, Any] The question to get the label instruction for. required Returns: Name Type Description str str The label instruction for the question. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _get_label_instruction(self, question: Dict[str, Any]) -> str:\n \"\"\"Get the label instruction for the question.\n\n Args:\n question (Dict[str, Any]): The question to get the label instruction for.\n\n Returns:\n str: The label instruction for the question.\n \"\"\"\n question_type = question[\"settings\"][\"type\"]\n return self.question_to_label_instruction[question_type]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_question","title":"_format_question(question) ","text":"Format the question settings into a string. Parameters: Name Type Description Default question Dict[str, Any] The question to format. required Returns: Name Type Description str str The formatted question. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _format_question(self, question: Dict[str, Any]) -> str:\n \"\"\"Format the question settings into a string.\n\n Args:\n question (Dict[str, Any]): The question to format.\n\n Returns:\n str: The formatted question.\n \"\"\"\n output = []\n output.append(f\"question: {self._get_label_instruction(question)}\")\n if \"options\" in question.get(\"settings\", {}):\n output.append(\n f\"optional labels: {[option['value'] for option in question.get('settings', {}).get('options', [])]}\"\n )\n return \"\\n\".join(output)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_example_records","title":"_format_example_records(records, fields, question) ","text":"Format the example records into a string. Parameters: Name Type Description Default records List[Dict[str, Any]] The records to format. 
required fields List[Dict[str, Any]] The fields to format. required question Dict[str, Any] The question to format. required Returns: Name Type Description str str The formatted example records. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _format_example_records(\n self,\n records: List[Dict[str, Any]],\n fields: List[Dict[str, Any]],\n question: Dict[str, Any],\n) -> str:\n \"\"\"Format the example records into a string.\n\n Args:\n records (List[Dict[str, Any]]): The records to format.\n fields (List[Dict[str, Any]]): The fields to format.\n question (Dict[str, Any]): The question to format.\n\n Returns:\n str: The formatted example records.\n \"\"\"\n base = []\n for record in records:\n responses = record.get(\"responses\", {})\n if responses.get(question[\"name\"]):\n base.append(self._format_record(record, fields))\n value = responses[question[\"name\"]][0][\"value\"]\n formatted_value = self._assign_value_to_question_value_model(\n value, question\n )\n base.append(f\"response: {formatted_value}\")\n base.append(\"\")\n else:\n warnings.warn(\n f\"Record {record} has no response for question {question['name']}. Skipping example record.\",\n stacklevel=2,\n )\n return \"\\n\".join(base)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.format_input","title":"format_input(input) ","text":"Format the input into a chat message. Parameters: Name Type Description Default input Dict[str, Union[Dict[str, Any], Record, TextField, MultiLabelQuestion, LabelQuestion, RatingQuestion, TextQuestion]] The input to format. required Returns: Type Description ChatType The formatted chat message. Raises: Type Description ValueError If question or fields are not provided. Source code in src/distilabel/steps/tasks/argilla_labeller.py def format_input(\n self,\n input: Dict[\n str,\n Union[\n Dict[str, Any],\n \"Record\",\n \"TextField\",\n \"MultiLabelQuestion\",\n \"LabelQuestion\",\n \"RatingQuestion\",\n \"TextQuestion\",\n ],\n ],\n) -> \"ChatType\":\n \"\"\"Format the input into a chat message.\n\n Args:\n input: The input to format.\n\n Returns:\n The formatted chat message.\n\n Raises:\n ValueError: If question or fields are not provided.\n \"\"\"\n input_keys = list(self.inputs.keys())\n record = input[input_keys[0]]\n fields = input.get(input_keys[1], self.fields)\n question = input.get(input_keys[2], self.question)\n examples = input.get(input_keys[3], self.example_records)\n guidelines = input.get(input_keys[4], self.guidelines)\n\n if question is None:\n raise ValueError(\"Question must be provided.\")\n if fields is None or any(field is None for field in fields):\n raise ValueError(\"Fields must be provided.\")\n\n record = record.to_dict() if not isinstance(record, dict) else record\n question = question.serialize() if not isinstance(question, dict) else question\n fields = [\n field.serialize() if not isinstance(field, dict) else field\n for field in fields\n ]\n examples = (\n [\n example.to_dict() if not isinstance(example, dict) else example\n for example in examples\n ]\n if examples\n else None\n )\n\n formatted_fields = self._format_record(record, fields)\n formatted_question = self._format_question(question)\n formatted_examples = (\n self._format_example_records(examples, fields, question)\n if examples\n else False\n )\n\n prompt = self._template.render(\n fields=formatted_fields,\n question=formatted_question,\n examples=formatted_examples,\n guidelines=guidelines,\n )\n\n messages = []\n if self.system_prompt:\n 
messages.append({\"role\": \"system\", \"content\": self.system_prompt})\n messages.append({\"role\": \"user\", \"content\": prompt})\n return messages\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.format_output","title":"format_output(output, input) ","text":"Format the output into a dictionary. Parameters: Name Type Description Default output Union[str, None] The output to format. required input Dict[str, Any] The input to format. required Returns: Type Description Dict[str, Any] Dict[str, Any]: The formatted output. Source code in src/distilabel/steps/tasks/argilla_labeller.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"Format the output into a dictionary.\n\n Args:\n output (Union[str, None]): The output to format.\n input (Dict[str, Any]): The input to format.\n\n Returns:\n Dict[str, Any]: The formatted output.\n \"\"\"\n from argilla import Suggestion\n\n question: Union[\n Any,\n Dict[str, Any],\n LabelQuestion,\n MultiLabelQuestion,\n RatingQuestion,\n TextQuestion,\n None,\n ] = input.get(list(self.inputs.keys())[2], self.question) or self.question\n question = question.serialize() if not isinstance(question, dict) else question\n model = self._get_pydantic_model_of_structured_output(question)\n validated_output = model(**json.loads(output))\n value = self._get_value_from_question_value_model(validated_output)\n suggestion = Suggestion(\n value=value,\n question_name=question[\"name\"],\n type=\"model\",\n agent=self.llm.model_name,\n ).serialize()\n return {\n self.outputs[0]: {\n k: v\n for k, v in suggestion.items()\n if k in [\"value\", \"question_name\", \"type\", \"agent\"]\n }\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.process","title":"process(inputs) ","text":"Process the input through the task. Parameters: Name Type Description Default inputs StepInput The input to process. required Returns: Name Type Description StepOutput StepOutput The output of the task. Source code in src/distilabel/steps/tasks/argilla_labeller.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Process the input through the task.\n\n Args:\n inputs (StepInput): The input to process.\n\n Returns:\n StepOutput: The output of the task.\n \"\"\"\n\n question_list = [input.get(\"question\", self.question) for input in inputs]\n fields_list = [input.get(\"fields\", self.fields) for input in inputs]\n # check if any field for the field in fields is None\n for fields in fields_list:\n if any(field is None for field in fields):\n raise ValueError(\n \"Fields must be provided during init or through `process` method.\"\n )\n # check if any question is None\n if any(question is None for question in question_list):\n raise ValueError(\n \"Question must be provided during init or through `process` method.\"\n )\n question_list = [\n question.serialize() if not isinstance(question, dict) else question\n for question in question_list\n ]\n if not all(question == question_list[0] for question in question_list):\n warnings.warn(\n \"Not all questions are the same. Processing each question separately by setting the structured output for each question. 
This may impact performance.\",\n stacklevel=2,\n )\n for input, question in zip(inputs, question_list):\n self._set_llm_structured_output_for_question(question)\n yield from super().process([input])\n else:\n question = question_list[0]\n self._set_llm_structured_output_for_question(question)\n yield from super().process(inputs)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_value_from_question_value_model","title":"_get_value_from_question_value_model(question_value_model) ","text":"Get the value from the question value model. Parameters: Name Type Description Default question_value_model BaseModel The question value model to get the value from. required Returns: Name Type Description Any Any The value from the question value model. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _get_value_from_question_value_model(\n self, question_value_model: BaseModel\n) -> Any:\n \"\"\"Get the value from the question value model.\n\n Args:\n question_value_model (BaseModel): The question value model to get the value from.\n\n Returns:\n Any: The value from the question value model.\n \"\"\"\n for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n if hasattr(question_value_model, attr):\n return getattr(question_value_model, attr)\n raise ValueError(f\"Unsupported question type: {question_value_model}\")\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._assign_value_to_question_value_model","title":"_assign_value_to_question_value_model(value, question) ","text":"Assign the value to the question value model. Parameters: Name Type Description Default value Any The value to assign. required question Dict[str, Any] The question to assign the value to. required Returns: Name Type Description BaseModel BaseModel The question value model with the assigned value. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _assign_value_to_question_value_model(\n self, value: Any, question: Dict[str, Any]\n) -> BaseModel:\n \"\"\"Assign the value to the question value model.\n\n Args:\n value (Any): The value to assign.\n question (Dict[str, Any]): The question to assign the value to.\n\n Returns:\n BaseModel: The question value model with the assigned value.\n \"\"\"\n question_value_model = self._get_pydantic_model_of_structured_output(question)\n for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n try:\n model_dict = {attr: value}\n question_value_model = question_value_model(**model_dict)\n return question_value_model.model_dump_json()\n except AttributeError:\n pass\n return value\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_pydantic_model_of_structured_output","title":"_get_pydantic_model_of_structured_output(question) ","text":"Get the Pydantic model of the structured output. Parameters: Name Type Description Default question Dict[str, Any] The question to get the Pydantic model of the structured output for. required Returns: Name Type Description BaseModel BaseModel The Pydantic model of the structured output. 
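For reference, here is a minimal self-contained sketch of the value models this helper builds for each question type, mirroring the behaviour described above; the model names and the sample value are illustrative only.

```python
from typing import List, Optional
from pydantic import BaseModel, Field

# Illustrative stand-ins for the per-question-type models created by
# `_get_pydantic_model_of_structured_output` (see the source below).
class MultiLabelValue(BaseModel):   # settings.type == "multi_label_selection"
    labels: Optional[List[str]] = Field(default_factory=list)

class LabelValue(BaseModel):        # settings.type == "label_selection"
    label: str

class TextValue(BaseModel):         # settings.type == "text"
    text: str

class RatingValue(BaseModel):       # settings.type == "rating"
    rating: int

# The LLM response is validated against the model matching the question type,
# e.g. for a label_selection question:
print(LabelValue(**{"label": "positive"}).label)  # -> "positive"
```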
Source code in src/distilabel/steps/tasks/argilla_labeller.py def _get_pydantic_model_of_structured_output(\n self,\n question: Dict[str, Any],\n) -> BaseModel:\n \"\"\"Get the Pydantic model of the structured output.\n\n Args:\n question (Dict[str, Any]): The question to get the Pydantic model of the structured output for.\n\n Returns:\n BaseModel: The Pydantic model of the structured output.\n \"\"\"\n\n question_type = question[\"settings\"][\"type\"]\n\n if question_type == \"multi_label_selection\":\n\n class QuestionValueModel(BaseModel):\n labels: Optional[List[str]] = Field(default_factory=list)\n\n elif question_type == \"label_selection\":\n\n class QuestionValueModel(BaseModel):\n label: str\n\n elif question_type == \"text\":\n\n class QuestionValueModel(BaseModel):\n text: str\n\n elif question_type == \"rating\":\n\n class QuestionValueModel(BaseModel):\n rating: int\n else:\n raise ValueError(f\"Unsupported question type: {question}\")\n\n return QuestionValueModel\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR","title":"CLAIR ","text":" Bases: Task Contrastive Learning from AI Revisions (CLAIR). CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting preference A preferred A\u2019 is much more contrastive and precise. Input columns - task (
str ): The task or instruction. - student_solution (
str ): An answer to the task that is to be revised. Output columns - revision (
str ): The revised text. - rational (
str ): The rationale for the provided revision. - model_name (
str ): The name of the model used to generate the revision and rational. Categories - preference
- text-generation
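In downstream use, a CLAIR row is typically turned into a contrastive preference pair. The sketch below assumes the usual APO/CLAIR convention of treating the revision as the chosen response and the original student solution as the rejected one; the `prompt`/`chosen`/`rejected` column names are illustrative and not part of CLAIR's outputs (see the example that follows for the raw output format).

```python
# Build a DPO-style preference record from one CLAIR output row.
def to_preference_pair(row: dict) -> dict:
    return {
        "prompt": row["task"],
        "chosen": row["revision"],            # minimally revised solution A'
        "rejected": row["student_solution"],  # original solution A
    }
```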
References Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment APO and CLAIR - GitHub Repository Examples: Create contrastive preference pairs: from distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096,\n },\n)\nclair_task = CLAIR(llm=llm)\n\nclair_task.load()\n\nresult = next(\n clair_task.process(\n [\n {\n \"task\": \"How many gaps are there between the earth and the moon?\",\n \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n }\n ]\n )\n)\n# result\n# [{'task': 'How many gaps are there between the earth and the moon?',\n# 'student_solution': 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n# 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. 
This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n# 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n# 'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n# {'role': 'user',\n# 'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. 
The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Citations: ```\n@misc{doosterlinck2024anchoredpreferenceoptimizationcontrastive,\n title={Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment},\n author={Karel D'Oosterlinck and Winnie Xu and Chris Develder and Thomas Demeester and Amanpreet Singh and Christopher Potts and Douwe Kiela and Shikib Mehri},\n year={2024},\n eprint={2408.06266},\n archivePrefix={arXiv},\n primaryClass={cs.LG},\n url={https://arxiv.org/abs/2408.06266},\n}\n```\n Source code in src/distilabel/steps/tasks/clair.py class CLAIR(Task):\n r\"\"\"Contrastive Learning from AI Revisions (CLAIR).\n\n CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting\n preference A `preferred` A\u2019 is much more contrastive and precise.\n\n Input columns:\n - task (`str`): The task or instruction.\n - student_solution (`str`): An answer to the task that is to be revised.\n\n Output columns:\n - revision (`str`): The revised text.\n - rational (`str`): The rational for the provided revision.\n - model_name (`str`): The name of the model used to generate the revision and rational.\n\n Categories:\n - preference\n - text-generation\n\n References:\n - [`Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment`](https://arxiv.org/abs/2408.06266v1)\n - [`APO and CLAIR - GitHub Repository`](https://github.com/ContextualAI/CLAIR_and_APO)\n\n Examples:\n Create contrastive preference pairs:\n\n ```python\n from distilabel.steps.tasks import CLAIR\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096,\n },\n )\n clair_task = CLAIR(llm=llm)\n\n clair_task.load()\n\n result = next(\n clair_task.process(\n [\n {\n \"task\": \"How many gaps are there between the earth and the moon?\",\n \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n }\n ]\n )\n )\n # result\n # [{'task': 'How many gaps are there between the earth and the moon?',\n # 'student_solution': 'There are no gaps between the Earth and the Moon. 
The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n # 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n # 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n # 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. 
This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n # 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n # 'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n # {'role': 'user',\n # 'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. 
Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Citations:\n\n ```\n @misc{doosterlinck2024anchoredpreferenceoptimizationcontrastive,\n title={Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment},\n author={Karel D'Oosterlinck and Winnie Xu and Chris Develder and Thomas Demeester and Amanpreet Singh and Christopher Potts and Douwe Kiela and Shikib Mehri},\n year={2024},\n eprint={2408.06266},\n archivePrefix={arXiv},\n primaryClass={cs.LG},\n url={https://arxiv.org/abs/2408.06266},\n }\n ```\n \"\"\"\n\n system_prompt: str = SYSTEM_PROMPT\n _template: Union[Template, None] = PrivateAttr(...)\n\n def load(self) -> None:\n super().load()\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"clair.jinja2\"\n )\n with open(_path, \"r\") as f:\n self._template = Template(f.read())\n\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"task\", \"student_solution\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"revision\", \"rational\", \"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n task=input[\"task\"], student_solution=input[\"student_solution\"]\n ),\n },\n ]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction-response pair.\n \"\"\"\n if output is None:\n return self._default_error()\n\n return self._format_output(output)\n\n def _format_output(self, output: Union[str, None]) -> Dict[str, Any]:\n if \"**Corrected Student Solution:**\" in output:\n splits = output.split(\"**Corrected Student Solution:**\")\n elif \"{corrected_student_solution}:\" in output:\n splits = output.split(\"{corrected_student_solution}:\")\n elif \"{corrected_student_solution}\" in output:\n splits = output.split(\"{corrected_student_solution}\")\n elif \"**Worsened Student Solution:**\" in output:\n splits = output.split(\"**Worsened Student Solution:**\")\n elif \"{worsened_student_solution}:\" in output:\n splits = output.split(\"{worsened_student_solution}:\")\n elif \"{worsened_student_solution}\" in output:\n splits = output.split(\"{worsened_student_solution}\")\n else:\n splits = None\n\n # Safety check when the output doesn't follow the expected format\n if not splits:\n return self._default_error()\n\n if len(splits) >= 2:\n revision = splits[1]\n revision = revision.strip(\"\\n\\n\").strip() # noqa: B005\n\n rational = splits[0]\n if \"{teacher_reasoning}\" in rational:\n rational = rational.split(\"{teacher_reasoning}\")[1].strip(\":\").strip()\n rational = rational.strip(\"\\n\\n\").strip() # noqa: B005\n else:\n return self._default_error()\n return {\"revision\": revision, \"rational\": rational}\n\n def _default_error(self) -> Dict[str, None]:\n return {\"revision\": None, \"rational\": None}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/clair.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n task=input[\"task\"], student_solution=input[\"student_solution\"]\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR.format_output","title":"format_output(output, input) ","text":"The output is formatted as a list with the score of each instruction-response pair. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the key scores containing the scores for each instruction-response pair. Source code in src/distilabel/steps/tasks/clair.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction-response pair.\n \"\"\"\n if output is None:\n return self._default_error()\n\n return self._format_output(output)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer","title":"ComplexityScorer ","text":" Bases: Task Score instructions based on their complexity using an LLM . ComplexityScorer is a pre-defined task used to rank a list of instructions based in their complexity. It's an implementation of the complexity score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. Attributes: Name Type Description _template Union[Template, None] a Jinja2 template used to format the input for the LLM. Input columns - instructions (
List[str] ): The list of instructions to be scored. Output columns - scores (
List[float] ): The score for each instruction. - model_name (
str ): The model name used to generate the scores. Categories - scorer
- complexity
- instruction
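A common follow-up is to filter a dataset by the returned scores. A minimal sketch assuming the documented output format (a `scores` list parallel to `instructions`); the threshold and the sample row are illustrative:

```python
# `row` is one output row from ComplexityScorer.process (see the examples below).
row = {"instructions": ["plain instruction", "highly complex instruction"], "scores": [1, 5]}

threshold = 3  # illustrative cut-off
kept = [
    instruction
    for instruction, score in zip(row["instructions"], row["scores"])
    if score is not None and score >= threshold
]
print(kept)  # -> ['highly complex instruction']
```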
References What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning Examples: Evaluate the complexity of your instructions: from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n Generate structured output with default schema: from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=use_default_structured_output\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\n \"scores\": [\\n 1, \\n 2\\n ]\\n}'}}]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n Source code in src/distilabel/steps/tasks/complexity_scorer.py class ComplexityScorer(Task):\n \"\"\"Score instructions based on their complexity using an `LLM`.\n\n `ComplexityScorer` is a pre-defined task used to rank a list of instructions based in\n their complexity. It's an implementation of the complexity score task from the paper\n 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection\n in Instruction Tuning'.\n\n Attributes:\n _template: a Jinja2 template used to format the input for the LLM.\n\n Input columns:\n - instructions (`List[str]`): The list of instructions to be scored.\n\n Output columns:\n - scores (`List[float]`): The score for each instruction.\n - model_name (`str`): The model name used to generate the scores.\n\n Categories:\n - scorer\n - complexity\n - instruction\n\n References:\n - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n Examples:\n Evaluate the complexity of your instructions:\n\n ```python\n from distilabel.steps.tasks import ComplexityScorer\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n scorer = ComplexityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n )\n\n scorer.load()\n\n result = next(\n scorer.process(\n [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n )\n )\n # result\n # [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n ```\n\n Generate structured output with default schema:\n\n ```python\n from distilabel.steps.tasks import ComplexityScorer\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n scorer = ComplexityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=use_default_structured_output\n )\n\n scorer.load()\n\n result = next(\n scorer.process(\n [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n )\n )\n # result\n # [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\\\n \"scores\": [\\\\n 1, \\\\n 2\\\\n ]\\\\n}'}}]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n \"\"\"\n\n _template: Union[Template, None] = PrivateAttr(...)\n _can_be_used_with_offline_batch_generation = True\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"complexity-scorer.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the task are the `instructions`.\"\"\"\n return [\"instructions\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render(instructions=input[\"instructions\"]), # type: ignore\n }\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are: a list of `scores` containing the complexity score for each\n instruction in `instructions`, and the `model_name`.\"\"\"\n return [\"scores\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction.\n \"\"\"\n if output is None:\n return {\"scores\": [None] * len(input[\"instructions\"])}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n scores = []\n score_lines = output.split(\"\\n\")\n for i, line in enumerate(score_lines):\n match = _PARSE_SCORE_LINE_REGEX.match(line)\n score = float(match.group(1)) if match else None\n scores.append(score)\n if i == len(input[\"instructions\"]) - 1:\n break\n return {\"scores\": scores}\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaComplexityScorer(BaseModel):\n scores: List[int]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"scores\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Scores\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"scores\"],\n \"title\": \"SchemaComplexityScorer\",\n \"type\": \"object\",\n }\n\n def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n ) -> Dict[str, str]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n return {\"scores\": [None] * len(input[\"instructions\"])}\n\n @override\n def _sample_input(self) -> \"ChatType\":\n \"\"\"Returns a sample input to be used in the `print` method.\n Tasks that don't adhere to a format input that returns a map of the type\n str -> str should override this method to return a sample input.\n \"\"\"\n return self.format_input(\n {\n \"instructions\": [\n f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n ],\n }\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.inputs","title":"inputs: List[str] property ","text":"The inputs for the task are the instructions . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.outputs","title":"outputs: List[str] property ","text":"The output for the task are: a list of scores containing the complexity score for each instruction in instructions , and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/complexity_scorer.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"complexity-scorer.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. 
Source code in src/distilabel/steps/tasks/complexity_scorer.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render(instructions=input[\"instructions\"]), # type: ignore\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.format_output","title":"format_output(output, input) ","text":"The output is formatted as a list with the score of each instruction. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the key scores containing the scores for each instruction. Source code in src/distilabel/steps/tasks/complexity_scorer.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction.\n \"\"\"\n if output is None:\n return {\"scores\": [None] * len(input[\"instructions\"])}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n scores = []\n score_lines = output.split(\"\\n\")\n for i, line in enumerate(score_lines):\n match = _PARSE_SCORE_LINE_REGEX.match(line)\n score = float(match.group(1)) if match else None\n scores.append(score)\n if i == len(input[\"instructions\"]) - 1:\n break\n return {\"scores\": scores}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaComplexityScorer(BaseModel):\n scores: List[int]\n Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/complexity_scorer.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaComplexityScorer(BaseModel):\n scores: List[int]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"scores\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Scores\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"scores\"],\n \"title\": \"SchemaComplexityScorer\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer._format_structured_output","title":"_format_structured_output(output, input) ","text":"Parses the structured response, which should correspond to a dictionary with either positive , or positive and negative keys. 
Parameters: Name Type Description Default output str The output from the LLM . required Returns: Type Description Dict[str, str] Formatted output. Source code in src/distilabel/steps/tasks/complexity_scorer.py def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n) -> Dict[str, str]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n return {\"scores\": [None] * len(input[\"instructions\"])}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer._sample_input","title":"_sample_input() ","text":"Returns a sample input to be used in the print method. Tasks that don't adhere to a format input that returns a map of the type str -> str should override this method to return a sample input. Source code in src/distilabel/steps/tasks/complexity_scorer.py @override\ndef _sample_input(self) -> \"ChatType\":\n \"\"\"Returns a sample input to be used in the `print` method.\n Tasks that don't adhere to a format input that returns a map of the type\n str -> str should override this method to return a sample input.\n \"\"\"\n return self.format_input(\n {\n \"instructions\": [\n f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n ],\n }\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct","title":"EvolInstruct ","text":" Bases: Task Evolve instructions using an LLM . WizardLM: Empowering Large Language Models to Follow Complex Instructions Attributes: Name Type Description num_evolutions int The number of evolutions to be performed. store_evolutions bool Whether to store all the evolutions or just the last one. Defaults to False . generate_answers bool Whether to generate answers for the evolved instructions. Defaults to False . include_original_instruction bool Whether to include the original instruction in the evolved_instructions output column. Defaults to False . mutation_templates Dict[str, str] The mutation templates to be used for evolving the instructions. Defaults to the ones provided in the utils.py file. seed RuntimeParameter[int] The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . Runtime parameters seed : The seed to be set for numpy in order to randomly pick a mutation method. Input columns - instruction (
str ): The instruction to evolve. Output columns - evolved_instruction (
str ): The evolved instruction if store_evolutions=False . - evolved_instructions (
List[str] ): The evolved instructions if store_evolutions=True . - model_name (
str ): The name of the LLM used to evolve the instructions. - answer (
str ): The answer to the evolved instruction if generate_answers=True and store_evolutions=False . - answers (
List[str] ): The answers to the evolved instructions if generate_answers=True and store_evolutions=True . Categories - evol - instruction References - WizardLM: Empowering Large Language Models to Follow Complex Instructions
- GitHub: h2oai/h2o-wizardlm
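Beyond the standalone calls shown in the examples below, the task is usually wired into a Pipeline. The following is a minimal sketch only, assuming LoadDataFromDicts as the upstream step and the same placeholder InferenceEndpointsLLM; the pipeline name, step data and model are illustrative, not taken from the upstream docs:

```python
from distilabel.models import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import EvolInstruct

with Pipeline(name="evol-instruct-sketch") as pipeline:
    # A couple of seed instructions to evolve (illustrative data).
    load_data = LoadDataFromDicts(
        data=[
            {"instruction": "Explain photosynthesis."},
            {"instruction": "Write a haiku about the sea."},
        ]
    )
    evol_instruct = EvolInstruct(
        llm=InferenceEndpointsLLM(
            model_id="mistralai/Mistral-7B-Instruct-v0.2",  # placeholder for your actual LLM
        ),
        num_evolutions=2,
    )
    # Connect the loader output (`instruction`) to the task input.
    load_data >> evol_instruct

if __name__ == "__main__":
    distiset = pipeline.run(use_cache=False)
```

The resulting dataset then contains the evolved_instruction column (or evolved_instructions and answer/answers, depending on the attributes) described above.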
Examples: Evolve an instruction using an LLM: from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n Keep the iterations of the evolutions: from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n store_evolutions=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'evolved_instructions': ['initial evolution', 'final evolution'],\n# 'model_name': 'model_name'\n# }\n# ]\n Generate answers for the instructions in a single step: from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n generate_answers=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'evolved_instruction': 'evolved instruction',\n# 'answer': 'answer to the instruction',\n# 'model_name': 'model_name'\n# }\n# ]\n Citations @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n}\n Source code in src/distilabel/steps/tasks/evol_instruct/base.py class EvolInstruct(Task):\n \"\"\"Evolve instructions using an `LLM`.\n\n WizardLM: Empowering Large Language Models to Follow Complex Instructions\n\n Attributes:\n num_evolutions: The number of evolutions to be performed.\n store_evolutions: Whether to store all the evolutions or just the last one. Defaults\n to `False`.\n generate_answers: Whether to generate answers for the evolved instructions. Defaults\n to `False`.\n include_original_instruction: Whether to include the original instruction in the\n `evolved_instructions` output column. 
Defaults to `False`.\n mutation_templates: The mutation templates to be used for evolving the instructions.\n Defaults to the ones provided in the `utils.py` file.\n seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n Defaults to `42`.\n\n Runtime parameters:\n - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n Input columns:\n - instruction (`str`): The instruction to evolve.\n\n Output columns:\n - evolved_instruction (`str`): The evolved instruction if `store_evolutions=False`.\n - evolved_instructions (`List[str]`): The evolved instructions if `store_evolutions=True`.\n - model_name (`str`): The name of the LLM used to evolve the instructions.\n - answer (`str`): The answer to the evolved instruction if `generate_answers=True`\n and `store_evolutions=False`.\n - answers (`List[str]`): The answers to the evolved instructions if `generate_answers=True`\n and `store_evolutions=True`.\n\n Categories:\n - evol\n - instruction\n\n References:\n - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n - [GitHub: h2oai/h2o-wizardlm](https://github.com/h2oai/h2o-wizardlm)\n\n Examples:\n Evolve an instruction using an LLM:\n\n ```python\n from distilabel.steps.tasks import EvolInstruct\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n )\n\n evol_instruct.load()\n\n result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n # result\n # [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n ```\n\n Keep the iterations of the evolutions:\n\n ```python\n from distilabel.steps.tasks import EvolInstruct\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n store_evolutions=True,\n )\n\n evol_instruct.load()\n\n result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n # result\n # [\n # {\n # 'instruction': 'common instruction',\n # 'evolved_instructions': ['initial evolution', 'final evolution'],\n # 'model_name': 'model_name'\n # }\n # ]\n ```\n\n Generate answers for the instructions in a single step:\n\n ```python\n from distilabel.steps.tasks import EvolInstruct\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n generate_answers=True,\n )\n\n evol_instruct.load()\n\n result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n # result\n # [\n # {\n # 'instruction': 'common instruction',\n # 'evolved_instruction': 'evolved instruction',\n # 'answer': 'answer to the instruction',\n # 'model_name': 'model_name'\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n 
archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n }\n ```\n \"\"\"\n\n num_evolutions: int\n store_evolutions: bool = False\n generate_answers: bool = False\n include_original_instruction: bool = False\n mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n\n seed: RuntimeParameter[int] = Field(\n default=42,\n description=\"As `numpy` is being used in order to randomly pick a mutation method, then is nice to seed a random seed.\",\n )\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`.\"\"\"\n return [\"instruction\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [{\"role\": \"user\", \"content\": input}]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are the `evolved_instruction/s`, the `answer` if `generate_answers=True`\n and the `model_name`.\"\"\"\n # TODO: having to define a `model_name` column every time as the `Task.outputs` is not ideal,\n # this could be handled always and the value could be included within the DAG validation when\n # a `Task` is used, since all the `Task` subclasses will have an `llm` with a `model_name` attr.\n _outputs = [\n (\n \"evolved_instruction\"\n if not self.store_evolutions\n else \"evolved_instructions\"\n ),\n \"model_name\",\n ]\n if self.generate_answers:\n _outputs.append(\"answer\" if not self.store_evolutions else \"answers\")\n return _outputs\n\n @override\n def format_output( # type: ignore\n self, instructions: Union[str, List[str]], answers: Optional[List[str]] = None\n ) -> Dict[str, Any]: # type: ignore\n \"\"\"The output for the task is a dict with: `evolved_instruction` or `evolved_instructions`,\n depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n `answer` if `generate_answers=True`; and, finally, the `model_name`.\n\n Args:\n instructions: The instructions to be included within the output.\n answers: The answers to be included within the output if `generate_answers=True`.\n\n Returns:\n If `store_evolutions=False` and `generate_answers=True` return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...};\n if `store_evolutions=True` and `generate_answers=True` return {\"evolved_instructions\": ..., \"model_name\": ..., \"answer\": ...};\n if `store_evolutions=False` and `generate_answers=False` return {\"evolved_instruction\": ..., \"model_name\": ...};\n if `store_evolutions=True` and `generate_answers=False` return {\"evolved_instructions\": ..., \"model_name\": ...}.\n \"\"\"\n _output = {}\n if not self.store_evolutions:\n _output[\"evolved_instruction\"] = instructions[-1]\n else:\n _output[\"evolved_instructions\"] = instructions\n\n if self.generate_answers and answers:\n if not self.store_evolutions:\n _output[\"answer\"] = answers[-1]\n else:\n _output[\"answers\"] = answers\n\n _output[\"model_name\"] = self.llm.model_name\n return _output\n\n @property\n def mutation_templates_names(self) -> List[str]:\n \"\"\"Returns the names i.e. 
keys of the provided `mutation_templates`.\"\"\"\n return list(self.mutation_templates.keys())\n\n def _apply_random_mutation(self, instruction: str) -> str:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n instruction: The instruction to be included within the mutation prompt.\n\n Returns:\n A random mutation prompt with the provided instruction.\n \"\"\"\n mutation = np.random.choice(self.mutation_templates_names)\n return self.mutation_templates[mutation].replace(\"<PROMPT>\", instruction) # type: ignore\n\n def _evolve_instructions(self, inputs: \"StepInput\") -> List[List[str]]:\n \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list where each item is a list with either the last evolved instruction if\n `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n \"\"\"\n\n instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n statistics: \"LLMStatistics\" = defaultdict(list)\n\n for iter_no in range(self.num_evolutions):\n formatted_prompts = []\n for instruction in instructions:\n formatted_prompts.append(self._apply_random_mutation(instruction[-1]))\n\n formatted_prompts = [\n self.format_input(prompt) for prompt in formatted_prompts\n ]\n responses = self.llm.generate(\n formatted_prompts,\n **self.llm.generation_kwargs, # type: ignore\n )\n generated_prompts = flatten_responses(\n [response[\"generations\"] for response in responses]\n )\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n evolved_instructions = []\n for generated_prompt in generated_prompts:\n generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n evolved_instructions.append(generated_prompt)\n\n if self.store_evolutions:\n instructions = [\n instruction + [evolved_instruction]\n for instruction, evolved_instruction in zip(\n instructions, evolved_instructions\n )\n ]\n else:\n instructions = [\n [evolved_instruction]\n for evolved_instruction in evolved_instructions\n ]\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(instructions)} instructions!\"\n )\n return instructions, dict(statistics)\n\n def _generate_answers(\n self, evolved_instructions: List[List[str]]\n ) -> Tuple[List[List[str]], \"LLMStatistics\"]:\n \"\"\"Generates the answer for the instructions in `instructions`.\n\n Args:\n evolved_instructions: A list of lists where each item is a list with either the last\n evolved instruction if `store_evolutions=False` or all the evolved instructions\n if `store_evolutions=True`.\n\n Returns:\n A list of answers for each instruction.\n \"\"\"\n formatted_instructions = [\n self.format_input(instruction)\n for instructions in evolved_instructions\n for instruction in instructions\n ]\n\n responses = self.llm.generate(\n formatted_instructions,\n num_generations=1,\n **self.llm.generation_kwargs, # type: ignore\n )\n generations = [response[\"generations\"] for response in responses]\n\n statistics: Dict[str, Any] = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n step = (\n self.num_evolutions\n if not self.include_original_instruction\n else self.num_evolutions + 1\n )\n\n return [\n flatten_responses(generations[i : i + step])\n 
for i in range(0, len(responses), step)\n ], dict(statistics)\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n evolved_instructions, statistics = self._evolve_instructions(inputs)\n\n if self.store_evolutions:\n # Remove the input instruction from the `evolved_instructions` list\n from_ = 1 if not self.include_original_instruction else 0\n evolved_instructions = [\n instruction[from_:] for instruction in evolved_instructions\n ]\n\n if not self.generate_answers:\n for input, instruction in zip(inputs, evolved_instructions):\n input.update(self.format_output(instruction))\n input.update(\n {\n \"distilabel_metadata\": {\n f\"statistics_instruction_{self.name}\": statistics\n }\n }\n )\n yield inputs\n\n self._logger.info(\n f\"\ud83c\udf89 Finished evolving {len(evolved_instructions)} instructions!\"\n )\n\n if self.generate_answers:\n self._logger.info(\n f\"\ud83e\udde0 Generating answers for the {len(evolved_instructions)} evolved instructions!\"\n )\n\n answers, statistics = self._generate_answers(evolved_instructions)\n\n self._logger.info(\n f\"\ud83c\udf89 Finished generating answers for the {len(evolved_instructions)} evolved\"\n \" instructions!\"\n )\n\n for idx, (input, instruction) in enumerate(\n zip(inputs, evolved_instructions)\n ):\n input.update(self.format_output(instruction, answers[idx]))\n input.update(\n {\n \"distilabel_metadata\": {\n f\"statistics_answer_{self.name}\": statistics\n }\n }\n )\n yield inputs\n\n @override\n def _sample_input(self) -> ChatType:\n return self.format_input(\n self._apply_random_mutation(\"<PLACEHOLDER_INSTRUCTION>\")\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.inputs","title":"inputs: List[str] property ","text":"The input for the task is the instruction . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.outputs","title":"outputs: List[str] property ","text":"The output for the task are the evolved_instruction/s , the answer if generate_answers=True and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.mutation_templates_names","title":"mutation_templates_names: List[str] property ","text":"Returns the names i.e. keys of the provided mutation_templates . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. And the system_prompt is added as the first message if it exists. Source code in src/distilabel/steps/tasks/evol_instruct/base.py def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. 
And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [{\"role\": \"user\", \"content\": input}]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.format_output","title":"format_output(instructions, answers=None) ","text":"The output for the task is a dict with: evolved_instruction or evolved_instructions , depending whether the value is either False or True for store_evolutions , respectively; answer if generate_answers=True ; and, finally, the model_name . Parameters: Name Type Description Default instructions Union[str, List[str]] The instructions to be included within the output. required answers Optional[List[str]] The answers to be included within the output if generate_answers=True . None Returns: Type Description Dict[str, Any] If store_evolutions=False and generate_answers=True return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...}; Dict[str, Any] if store_evolutions=True and generate_answers=True return {\"evolved_instructions\": ..., \"model_name\": ..., \"answer\": ...}; Dict[str, Any] if store_evolutions=False and generate_answers=False return {\"evolved_instruction\": ..., \"model_name\": ...}; Dict[str, Any] if store_evolutions=True and generate_answers=False return {\"evolved_instructions\": ..., \"model_name\": ...}. Source code in src/distilabel/steps/tasks/evol_instruct/base.py @override\ndef format_output( # type: ignore\n self, instructions: Union[str, List[str]], answers: Optional[List[str]] = None\n) -> Dict[str, Any]: # type: ignore\n \"\"\"The output for the task is a dict with: `evolved_instruction` or `evolved_instructions`,\n depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n `answer` if `generate_answers=True`; and, finally, the `model_name`.\n\n Args:\n instructions: The instructions to be included within the output.\n answers: The answers to be included within the output if `generate_answers=True`.\n\n Returns:\n If `store_evolutions=False` and `generate_answers=True` return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...};\n if `store_evolutions=True` and `generate_answers=True` return {\"evolved_instructions\": ..., \"model_name\": ..., \"answer\": ...};\n if `store_evolutions=False` and `generate_answers=False` return {\"evolved_instruction\": ..., \"model_name\": ...};\n if `store_evolutions=True` and `generate_answers=False` return {\"evolved_instructions\": ..., \"model_name\": ...}.\n \"\"\"\n _output = {}\n if not self.store_evolutions:\n _output[\"evolved_instruction\"] = instructions[-1]\n else:\n _output[\"evolved_instructions\"] = instructions\n\n if self.generate_answers and answers:\n if not self.store_evolutions:\n _output[\"answer\"] = answers[-1]\n else:\n _output[\"answers\"] = answers\n\n _output[\"model_name\"] = self.llm.model_name\n return _output\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._apply_random_mutation","title":"_apply_random_mutation(instruction) ","text":"Applies a random mutation from the ones provided as part of the mutation_templates enum, and returns the provided instruction within the mutation prompt. Parameters: Name Type Description Default instruction str The instruction to be included within the mutation prompt. required Returns: Type Description str A random mutation prompt with the provided instruction. 
Source code in src/distilabel/steps/tasks/evol_instruct/base.py def _apply_random_mutation(self, instruction: str) -> str:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n instruction: The instruction to be included within the mutation prompt.\n\n Returns:\n A random mutation prompt with the provided instruction.\n \"\"\"\n mutation = np.random.choice(self.mutation_templates_names)\n return self.mutation_templates[mutation].replace(\"<PROMPT>\", instruction) # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._evolve_instructions","title":"_evolve_instructions(inputs) ","text":"Evolves the instructions provided as part of the inputs of the task. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Returns: Type Description List[List[str]] A list where each item is a list with either the last evolved instruction if List[List[str]] store_evolutions=False or all the evolved instructions if store_evolutions=True . Source code in src/distilabel/steps/tasks/evol_instruct/base.py def _evolve_instructions(self, inputs: \"StepInput\") -> List[List[str]]:\n \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list where each item is a list with either the last evolved instruction if\n `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n \"\"\"\n\n instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n statistics: \"LLMStatistics\" = defaultdict(list)\n\n for iter_no in range(self.num_evolutions):\n formatted_prompts = []\n for instruction in instructions:\n formatted_prompts.append(self._apply_random_mutation(instruction[-1]))\n\n formatted_prompts = [\n self.format_input(prompt) for prompt in formatted_prompts\n ]\n responses = self.llm.generate(\n formatted_prompts,\n **self.llm.generation_kwargs, # type: ignore\n )\n generated_prompts = flatten_responses(\n [response[\"generations\"] for response in responses]\n )\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n evolved_instructions = []\n for generated_prompt in generated_prompts:\n generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n evolved_instructions.append(generated_prompt)\n\n if self.store_evolutions:\n instructions = [\n instruction + [evolved_instruction]\n for instruction, evolved_instruction in zip(\n instructions, evolved_instructions\n )\n ]\n else:\n instructions = [\n [evolved_instruction]\n for evolved_instruction in evolved_instructions\n ]\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(instructions)} instructions!\"\n )\n return instructions, dict(statistics)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._generate_answers","title":"_generate_answers(evolved_instructions) ","text":"Generates the answer for the instructions in instructions . Parameters: Name Type Description Default evolved_instructions List[List[str]] A list of lists where each item is a list with either the last evolved instruction if store_evolutions=False or all the evolved instructions if store_evolutions=True . 
required Returns: Type Description Tuple[List[List[str]], LLMStatistics] A list of answers for each instruction. Source code in src/distilabel/steps/tasks/evol_instruct/base.py def _generate_answers(\n self, evolved_instructions: List[List[str]]\n) -> Tuple[List[List[str]], \"LLMStatistics\"]:\n \"\"\"Generates the answer for the instructions in `instructions`.\n\n Args:\n evolved_instructions: A list of lists where each item is a list with either the last\n evolved instruction if `store_evolutions=False` or all the evolved instructions\n if `store_evolutions=True`.\n\n Returns:\n A list of answers for each instruction.\n \"\"\"\n formatted_instructions = [\n self.format_input(instruction)\n for instructions in evolved_instructions\n for instruction in instructions\n ]\n\n responses = self.llm.generate(\n formatted_instructions,\n num_generations=1,\n **self.llm.generation_kwargs, # type: ignore\n )\n generations = [response[\"generations\"] for response in responses]\n\n statistics: Dict[str, Any] = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n step = (\n self.num_evolutions\n if not self.include_original_instruction\n else self.num_evolutions + 1\n )\n\n return [\n flatten_responses(generations[i : i + step])\n for i in range(0, len(responses), step)\n ], dict(statistics)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.process","title":"process(inputs) ","text":"Processes the inputs of the task and generates the outputs using the LLM. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Yields: Type Description StepOutput A list of Python dictionaries with the outputs of the task. 
Source code in src/distilabel/steps/tasks/evol_instruct/base.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n evolved_instructions, statistics = self._evolve_instructions(inputs)\n\n if self.store_evolutions:\n # Remove the input instruction from the `evolved_instructions` list\n from_ = 1 if not self.include_original_instruction else 0\n evolved_instructions = [\n instruction[from_:] for instruction in evolved_instructions\n ]\n\n if not self.generate_answers:\n for input, instruction in zip(inputs, evolved_instructions):\n input.update(self.format_output(instruction))\n input.update(\n {\n \"distilabel_metadata\": {\n f\"statistics_instruction_{self.name}\": statistics\n }\n }\n )\n yield inputs\n\n self._logger.info(\n f\"\ud83c\udf89 Finished evolving {len(evolved_instructions)} instructions!\"\n )\n\n if self.generate_answers:\n self._logger.info(\n f\"\ud83e\udde0 Generating answers for the {len(evolved_instructions)} evolved instructions!\"\n )\n\n answers, statistics = self._generate_answers(evolved_instructions)\n\n self._logger.info(\n f\"\ud83c\udf89 Finished generating answers for the {len(evolved_instructions)} evolved\"\n \" instructions!\"\n )\n\n for idx, (input, instruction) in enumerate(\n zip(inputs, evolved_instructions)\n ):\n input.update(self.format_output(instruction, answers[idx]))\n input.update(\n {\n \"distilabel_metadata\": {\n f\"statistics_answer_{self.name}\": statistics\n }\n }\n )\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolComplexity","title":"EvolComplexity ","text":" Bases: EvolInstruct Evolve instructions to make them more complex using an LLM . EvolComplexity is a task that evolves instructions to make them more complex, and it is based in the EvolInstruct task, using slight different prompts, but the exact same evolutionary approach. Attributes: Name Type Description num_instructions The number of instructions to be generated. generate_answers Whether to generate answers for the instructions or not. Defaults to False . mutation_templates Dict[str, str] The mutation templates to be used for the generation of the instructions. min_length Dict[str, str] Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . max_length Dict[str, str] Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . seed Dict[str, str] The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . Runtime parameters min_length : Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. max_length : Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. seed : The number of evolutions to be run. Input columns - instruction (
str ): The instruction to evolve. Output columns - evolved_instruction (
str ): The evolved instruction. - answer (
str , optional): The answer to the instruction if generate_answers=True . - model_name (
str ): The name of the LLM used to evolve the instructions. Categories - evol - instruction - deita References - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
- WizardLM: Empowering Large Language Models to Follow Complex Instructions
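Since EvolComplexity only swaps the mutation prompts while reusing the EvolInstruct machinery, inherited attributes such as store_evolutions behave the same way. A small illustrative sketch (not from the upstream docs) for inspecting the available mutation prompts before running the task:

```python
from distilabel.models import InferenceEndpointsLLM
from distilabel.steps.tasks import EvolComplexity

# Consider this as a placeholder for your actual LLM.
evol_complexity = EvolComplexity(
    llm=InferenceEndpointsLLM(
        model_id="mistralai/Mistral-7B-Instruct-v0.2",
    ),
    num_evolutions=2,
    store_evolutions=True,  # keep every intermediate evolution, not just the last one
)

# Names (keys) of the mutation templates that will be sampled at random.
print(evol_complexity.mutation_templates_names)
```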
Examples: Evolve an instruction using an LLM: from distilabel.steps.tasks import EvolComplexity\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity = EvolComplexity(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_complexity.load()\n\nresult = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n}\n Source code in src/distilabel/steps/tasks/evol_instruct/evol_complexity/base.py class EvolComplexity(EvolInstruct):\n \"\"\"Evolve instructions to make them more complex using an `LLM`.\n\n `EvolComplexity` is a task that evolves instructions to make them more complex,\n and it is based in the EvolInstruct task, using slight different prompts, but the\n exact same evolutionary approach.\n\n Attributes:\n num_instructions: The number of instructions to be generated.\n generate_answers: Whether to generate answers for the instructions or not. Defaults\n to `False`.\n mutation_templates: The mutation templates to be used for the generation of the\n instructions.\n min_length: Defines the length (in bytes) that the generated instruction needs to\n be higher than, to be considered valid. Defaults to `512`.\n max_length: Defines the length (in bytes) that the generated instruction needs to\n be lower than, to be considered valid. Defaults to `1024`.\n seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n Defaults to `42`.\n\n Runtime parameters:\n - `min_length`: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\n - `max_length`: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\n - `seed`: The number of evolutions to be run.\n\n Input columns:\n - instruction (`str`): The instruction to evolve.\n\n Output columns:\n - evolved_instruction (`str`): The evolved instruction.\n - answer (`str`, optional): The answer to the instruction if `generate_answers=True`.\n - model_name (`str`): The name of the LLM used to evolve the instructions.\n\n Categories:\n - evol\n - instruction\n - deita\n\n References:\n - [What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n\n Examples:\n Evolve an instruction using an LLM:\n\n ```python\n from distilabel.steps.tasks import EvolComplexity\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_complexity = EvolComplexity(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n )\n\n evol_complexity.load()\n\n result = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n # result\n # [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n\n ```\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n }\n ```\n \"\"\"\n\n mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolComplexityGenerator","title":"EvolComplexityGenerator ","text":" Bases: EvolInstructGenerator Generate evolved instructions with increased complexity using an LLM . EvolComplexityGenerator is a generation task that evolves instructions to make them more complex, and it is based in the EvolInstruct task, but using slight different prompts, but the exact same evolutionary approach. Attributes: Name Type Description num_instructions The number of instructions to be generated. generate_answers Whether to generate answers for the instructions or not. Defaults to False . mutation_templates Dict[str, str] The mutation templates to be used for the generation of the instructions. min_length Dict[str, str] Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . max_length Dict[str, str] Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . seed Dict[str, str] The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . Runtime parameters min_length : Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. max_length : Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. seed : The number of evolutions to be run. Output columns - instruction (
str ): The evolved instruction. - answer (
str , optional): The answer to the instruction if generate_answers=True . - model_name (
str ): The name of the LLM used to evolve the instructions. Categories - evol
- instruction
- generation
- deita
References - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
- WizardLM: Empowering Large Language Models to Follow Complex Instructions
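Because min_length, max_length and seed are runtime parameters, they can also be supplied when running a Pipeline instead of at instantiation time. The snippet below is a rough sketch under that assumption; the step name, pipeline name and values are illustrative, and the key used in parameters must match the name given to the step:

```python
from distilabel.models import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps.tasks import EvolComplexityGenerator

with Pipeline(name="evol-complexity-generator-sketch") as pipeline:
    # Generator tasks are root steps: they need no upstream data.
    generator = EvolComplexityGenerator(
        name="evol_complexity_generator",
        llm=InferenceEndpointsLLM(
            model_id="mistralai/Mistral-7B-Instruct-v0.2",  # placeholder for your actual LLM
        ),
        num_instructions=5,
    )

if __name__ == "__main__":
    distiset = pipeline.run(
        parameters={
            # Override the runtime parameters documented above.
            "evol_complexity_generator": {
                "min_length": 256,
                "max_length": 2048,
                "seed": 13,
            },
        },
        use_cache=False,
    )
```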
Examples: Generate evolved instructions without initial instructions: from distilabel.steps.tasks import EvolComplexityGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity_generator = EvolComplexityGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=2,\n)\n\nevol_complexity_generator.load()\n\nresult = next(scorer.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n}\n Source code in src/distilabel/steps/tasks/evol_instruct/evol_complexity/generator.py class EvolComplexityGenerator(EvolInstructGenerator):\n \"\"\"Generate evolved instructions with increased complexity using an `LLM`.\n\n `EvolComplexityGenerator` is a generation task that evolves instructions to make\n them more complex, and it is based in the EvolInstruct task, but using slight different\n prompts, but the exact same evolutionary approach.\n\n Attributes:\n num_instructions: The number of instructions to be generated.\n generate_answers: Whether to generate answers for the instructions or not. Defaults\n to `False`.\n mutation_templates: The mutation templates to be used for the generation of the\n instructions.\n min_length: Defines the length (in bytes) that the generated instruction needs to\n be higher than, to be considered valid. Defaults to `512`.\n max_length: Defines the length (in bytes) that the generated instruction needs to\n be lower than, to be considered valid. Defaults to `1024`.\n seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n Defaults to `42`.\n\n Runtime parameters:\n - `min_length`: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\n - `max_length`: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\n - `seed`: The number of evolutions to be run.\n\n Output columns:\n - instruction (`str`): The evolved instruction.\n - answer (`str`, optional): The answer to the instruction if `generate_answers=True`.\n - model_name (`str`): The name of the LLM used to evolve the instructions.\n\n Categories:\n - evol\n - instruction\n - generation\n - deita\n\n References:\n - [What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n\n Examples:\n Generate evolved instructions without initial instructions:\n\n ```python\n from distilabel.steps.tasks import EvolComplexityGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_complexity_generator = EvolComplexityGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=2,\n )\n\n evol_complexity_generator.load()\n\n result = next(scorer.process())\n # result\n # [{'instruction': 'generated instruction', 'model_name': 'test'}]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n\n ```\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n }\n ```\n \"\"\"\n\n mutation_templates: Dict[str, str] = GENERATION_MUTATION_TEMPLATES\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator","title":"EvolInstructGenerator ","text":" Bases: GeneratorTask Generate evolved instructions using an LLM . WizardLM: Empowering Large Language Models to Follow Complex Instructions Attributes: Name Type Description num_instructions int The number of instructions to be generated. generate_answers bool Whether to generate answers for the instructions or not. Defaults to False . mutation_templates Dict[str, str] The mutation templates to be used for the generation of the instructions. min_length RuntimeParameter[int] Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . max_length RuntimeParameter[int] Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . seed RuntimeParameter[int] The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . Runtime parameters min_length : Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. max_length : Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. seed : The seed to be set for numpy in order to randomly pick a mutation method. Output columns - instruction (
str ): The generated instruction if generate_answers=False . - answer (
str ): The generated answer if generate_answers=True . - instructions (
List[str] ): The generated instructions if generate_answers=True . - model_name (
str ): The name of the LLM used to generate and evolve the instructions. Categories - evol
- instruction
- generation
References - WizardLM: Empowering Large Language Models to Follow Complex Instructions
- GitHub: h2oai/h2o-wizardlm
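For standalone experimentation, a rough sketch of consuming the generator output directly is given below. It relies on the (batch, is_last_batch) tuples yielded by process(), as shown in the source code further down, and reuses the same placeholder LLM; the printed fields follow the output columns listed above:

```python
from distilabel.models import InferenceEndpointsLLM
from distilabel.steps.tasks import EvolInstructGenerator

# Consider this as a placeholder for your actual LLM.
evol_instruct_generator = EvolInstructGenerator(
    llm=InferenceEndpointsLLM(
        model_id="mistralai/Mistral-7B-Instruct-v0.2",
    ),
    num_instructions=2,
    generate_answers=True,  # also produce an answer for each evolved instruction
)

evol_instruct_generator.load()

# `process()` yields `(batch, is_last_batch)` tuples until `num_instructions` is reached.
for batch, is_last_batch in evol_instruct_generator.process():
    for row in batch:
        print(row["instruction"], row.get("answer"))
    if is_last_batch:
        break
```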
Examples: Generate evolved instructions without initial instructions: from distilabel.steps.tasks import EvolInstructGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct_generator = EvolInstructGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=2,\n)\n\nevol_instruct_generator.load()\n\nresult = next(scorer.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n Citations @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n}\n Source code in src/distilabel/steps/tasks/evol_instruct/generator.py class EvolInstructGenerator(GeneratorTask):\n \"\"\"Generate evolved instructions using an `LLM`.\n\n WizardLM: Empowering Large Language Models to Follow Complex Instructions\n\n Attributes:\n num_instructions: The number of instructions to be generated.\n generate_answers: Whether to generate answers for the instructions or not. Defaults\n to `False`.\n mutation_templates: The mutation templates to be used for the generation of the\n instructions.\n min_length: Defines the length (in bytes) that the generated instruction needs to\n be higher than, to be considered valid. Defaults to `512`.\n max_length: Defines the length (in bytes) that the generated instruction needs to\n be lower than, to be considered valid. Defaults to `1024`.\n seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n Defaults to `42`.\n\n Runtime parameters:\n - `min_length`: Defines the length (in bytes) that the generated instruction needs\n to be higher than, to be considered valid.\n - `max_length`: Defines the length (in bytes) that the generated instruction needs\n to be lower than, to be considered valid.\n - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n Output columns:\n - instruction (`str`): The generated instruction if `generate_answers=False`.\n - answer (`str`): The generated answer if `generate_answers=True`.\n - instructions (`List[str]`): The generated instructions if `generate_answers=True`.\n - model_name (`str`): The name of the LLM used to generate and evolve the instructions.\n\n Categories:\n - evol\n - instruction\n - generation\n\n References:\n - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n - [GitHub: h2oai/h2o-wizardlm](https://github.com/h2oai/h2o-wizardlm)\n\n Examples:\n Generate evolved instructions without initial instructions:\n\n ```python\n from distilabel.steps.tasks import EvolInstructGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_instruct_generator = EvolInstructGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=2,\n )\n\n evol_instruct_generator.load()\n\n result = next(scorer.process())\n # result\n # [{'instruction': 'generated instruction', 'model_name': 'test'}]\n ```\n\n Citations:\n ```\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow 
Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n }\n ```\n \"\"\"\n\n num_instructions: int\n generate_answers: bool = False\n mutation_templates: Dict[str, str] = GENERATION_MUTATION_TEMPLATES\n\n min_length: RuntimeParameter[int] = Field(\n default=512,\n description=\"Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\",\n )\n max_length: RuntimeParameter[int] = Field(\n default=1024,\n description=\"Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\",\n )\n\n seed: RuntimeParameter[int] = Field(\n default=42,\n description=\"As `numpy` is being used in order to randomly pick a mutation method, then is nice to seed a random seed.\",\n )\n _seed_texts: Optional[List[str]] = PrivateAttr(default_factory=list)\n _prompts: Optional[List[str]] = PrivateAttr(default_factory=list)\n\n def _generate_seed_texts(self) -> List[str]:\n \"\"\"Generates a list of seed texts to be used as part of the starting prompts for the task.\n\n It will use the `FRESH_START` mutation template, as it needs to generate text from scratch; and\n a list of English words will be used to generate the seed texts that will be provided to the\n mutation method and included within the prompt.\n\n Returns:\n A list of seed texts to be used as part of the starting prompts for the task.\n \"\"\"\n seed_texts = []\n for _ in range(self.num_instructions * 10):\n num_words = np.random.choice([1, 2, 3, 4])\n seed_texts.append(\n self.mutation_templates[\"FRESH_START\"].replace( # type: ignore\n \"<PROMPT>\",\n \", \".join(\n [\n np.random.choice(self._english_nouns).strip()\n for _ in range(num_words)\n ]\n ),\n )\n )\n return seed_texts\n\n @override\n def model_post_init(self, __context: Any) -> None:\n \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n This is useful if you want to do some validation that requires the entire model to be initialized.\n \"\"\"\n super().model_post_init(__context)\n\n np.random.seed(self.seed)\n\n self._seed_texts = self._generate_seed_texts()\n self._prompts = [\n np.random.choice(self._seed_texts) for _ in range(self.num_instructions)\n ]\n\n @cached_property\n def _english_nouns(self) -> List[str]:\n \"\"\"A list of English nouns to be used as part of the starting prompts for the task.\n\n References:\n - https://github.com/h2oai/h2o-wizardlm\n \"\"\"\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps/tasks/evol_instruct/english_nouns.txt\"\n )\n with open(_path, mode=\"r\") as f:\n return [line.strip() for line in f.readlines()]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are the `instruction`, the `answer` if `generate_answers=True`\n and the `model_name`.\"\"\"\n _outputs = [\"instruction\", \"model_name\"]\n if self.generate_answers:\n _outputs.append(\"answer\")\n return _outputs\n\n def format_output( # type: ignore\n self, instruction: str, answer: Optional[str] = None\n ) -> Dict[str, Any]:\n \"\"\"The output for the task is a dict with: `instruction`; `answer` if `generate_answers=True`;\n and, finally, the `model_name`.\n\n Args:\n instruction: The instruction to be included within the output.\n answer: The answer to be included 
within the output if `generate_answers=True`.\n\n Returns:\n If `generate_answers=True` return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...};\n if `generate_answers=False` return {\"instruction\": ..., \"model_name\": ...};\n \"\"\"\n _output = {\n \"instruction\": instruction,\n \"model_name\": self.llm.model_name,\n }\n if self.generate_answers and answer is not None:\n _output[\"answer\"] = answer\n return _output\n\n @property\n def mutation_templates_names(self) -> List[str]:\n \"\"\"Returns the names i.e. keys of the provided `mutation_templates`.\"\"\"\n return list(self.mutation_templates.keys())\n\n def _apply_random_mutation(self, iter_no: int) -> List[\"ChatType\"]:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n iter_no: The iteration number to be used to check whether the iteration is the\n first one i.e. FRESH_START, or not.\n\n Returns:\n A random mutation prompt with the provided instruction formatted as an OpenAI conversation.\n \"\"\"\n prompts = []\n for idx in range(self.num_instructions):\n if (\n iter_no == 0\n or \"Write one question or request containing\" in self._prompts[idx] # type: ignore\n ):\n mutation = \"FRESH_START\"\n else:\n mutation = np.random.choice(self.mutation_templates_names)\n if mutation == \"FRESH_START\":\n self._prompts[idx] = np.random.choice(self._seed_texts) # type: ignore\n\n prompt_with_template = (\n self.mutation_templates[mutation].replace( # type: ignore\n \"<PROMPT>\",\n self._prompts[idx], # type: ignore\n ) # type: ignore\n if iter_no != 0\n else self._prompts[idx] # type: ignore\n )\n prompts.append([{\"role\": \"user\", \"content\": prompt_with_template}])\n return prompts\n\n def _generate_answers(\n self, instructions: List[List[str]]\n ) -> Tuple[List[str], \"LLMStatistics\"]:\n \"\"\"Generates the answer for the last instruction in `instructions`.\n\n Args:\n instructions: A list of lists where each item is a list with either the last\n evolved instruction if `store_evolutions=False` or all the evolved instructions\n if `store_evolutions=True`.\n\n Returns:\n A list of answers for the last instruction in `instructions`.\n \"\"\"\n # TODO: update to generate answers for all the instructions\n _formatted_instructions = [\n [{\"role\": \"user\", \"content\": instruction[-1]}]\n for instruction in instructions\n ]\n responses = self.llm.generate(\n _formatted_instructions,\n **self.llm.generation_kwargs, # type: ignore\n )\n statistics: Dict[str, Any] = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n return flatten_responses(\n [response[\"generations\"] for response in responses]\n ), dict(statistics)\n\n @override\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\": # NOQA: C901, type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n offset: The offset to start the generation from. Defaults to 0.\n\n Yields:\n A list of Python dictionaries with the outputs of the task, and a boolean\n flag indicating whether the task has finished or not i.e. 
is the last batch.\n \"\"\"\n instructions = []\n mutation_no = 0\n\n # TODO: update to take into account `offset`\n iter_no = 0\n while len(instructions) < self.num_instructions:\n prompts = self._apply_random_mutation(iter_no=iter_no)\n\n # TODO: Update the function to extract from the dict\n responses = self.llm.generate(prompts, **self.llm.generation_kwargs) # type: ignore\n\n generated_prompts = flatten_responses(\n [response[\"generations\"] for response in responses]\n )\n statistics: \"LLMStatistics\" = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n for idx, generated_prompt in enumerate(generated_prompts):\n generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n if self.max_length >= len(generated_prompt) >= self.min_length: # type: ignore\n instructions.append(generated_prompt)\n self._prompts[idx] = np.random.choice(self._seed_texts) # type: ignore\n else:\n self._prompts[idx] = generated_prompt # type: ignore\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} with {len(instructions)} instructions already evolved!\"\n )\n iter_no += 1\n\n if len(instructions) > self.num_instructions:\n instructions = instructions[: self.num_instructions]\n if len(instructions) > mutation_no:\n mutation_no = len(instructions) - mutation_no\n\n if not self.generate_answers and len(instructions[-mutation_no:]) > 0:\n formatted_generations = []\n for mutated_instruction in instructions[-mutation_no:]:\n mutated_instruction = self.format_output(mutated_instruction)\n mutated_instruction[\"distilabel_metadata\"] = {\n f\"statistics_instruction_{self.name}\": dict(statistics)\n }\n formatted_generations.append(mutated_instruction)\n yield (\n formatted_generations,\n len(instructions) >= self.num_instructions,\n )\n\n self._logger.info(f\"\ud83c\udf89 Finished evolving {len(instructions)} instructions!\")\n\n if self.generate_answers:\n self._logger.info(\n f\"\ud83e\udde0 Generating answers for the {len(instructions)} evolved instructions!\"\n )\n\n answers, statistics = self._generate_answers(instructions)\n\n self._logger.info(\n f\"\ud83c\udf89 Finished generating answers for the {len(instructions)} evolved instructions!\"\n )\n\n formatted_outputs = []\n for instruction, answer in zip(instructions, answers):\n formatted_output = self.format_output(instruction, answer)\n formatted_output[\"distilabel_metadata\"] = {\n f\"statistics_answer_{self.name}\": dict(statistics)\n }\n formatted_outputs.append(formatted_output)\n\n yield (\n formatted_outputs,\n True,\n )\n\n @override\n def _sample_input(self) -> \"ChatType\":\n return self._apply_random_mutation(iter_no=0)[0]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._english_nouns","title":"_english_nouns: List[str] cached property ","text":"A list of English nouns to be used as part of the starting prompts for the task. References - https://github.com/h2oai/h2o-wizardlm
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.outputs","title":"outputs: List[str] property ","text":"The output for the task are the instruction , the answer if generate_answers=True and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.mutation_templates_names","title":"mutation_templates_names: List[str] property ","text":"Returns the names i.e. keys of the provided mutation_templates . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._generate_seed_texts","title":"_generate_seed_texts() ","text":"Generates a list of seed texts to be used as part of the starting prompts for the task. It will use the FRESH_START mutation template, as it needs to generate text from scratch; and a list of English words will be used to generate the seed texts that will be provided to the mutation method and included within the prompt. Returns: Type Description List[str] A list of seed texts to be used as part of the starting prompts for the task. Source code in src/distilabel/steps/tasks/evol_instruct/generator.py def _generate_seed_texts(self) -> List[str]:\n \"\"\"Generates a list of seed texts to be used as part of the starting prompts for the task.\n\n It will use the `FRESH_START` mutation template, as it needs to generate text from scratch; and\n a list of English words will be used to generate the seed texts that will be provided to the\n mutation method and included within the prompt.\n\n Returns:\n A list of seed texts to be used as part of the starting prompts for the task.\n \"\"\"\n seed_texts = []\n for _ in range(self.num_instructions * 10):\n num_words = np.random.choice([1, 2, 3, 4])\n seed_texts.append(\n self.mutation_templates[\"FRESH_START\"].replace( # type: ignore\n \"<PROMPT>\",\n \", \".join(\n [\n np.random.choice(self._english_nouns).strip()\n for _ in range(num_words)\n ]\n ),\n )\n )\n return seed_texts\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.model_post_init","title":"model_post_init(__context) ","text":"Override this method to perform additional initialization after __init__ and model_construct . This is useful if you want to do some validation that requires the entire model to be initialized. Source code in src/distilabel/steps/tasks/evol_instruct/generator.py @override\ndef model_post_init(self, __context: Any) -> None:\n \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n This is useful if you want to do some validation that requires the entire model to be initialized.\n \"\"\"\n super().model_post_init(__context)\n\n np.random.seed(self.seed)\n\n self._seed_texts = self._generate_seed_texts()\n self._prompts = [\n np.random.choice(self._seed_texts) for _ in range(self.num_instructions)\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.format_output","title":"format_output(instruction, answer=None) ","text":"The output for the task is a dict with: instruction ; answer if generate_answers=True ; and, finally, the model_name . Parameters: Name Type Description Default instruction str The instruction to be included within the output. required answer Optional[str] The answer to be included within the output if generate_answers=True . 
None Returns: Type Description Dict[str, Any] If generate_answers=True return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...}; Dict[str, Any] if generate_answers=False return {\"instruction\": ..., \"model_name\": ...}; Source code in src/distilabel/steps/tasks/evol_instruct/generator.py def format_output( # type: ignore\n self, instruction: str, answer: Optional[str] = None\n) -> Dict[str, Any]:\n \"\"\"The output for the task is a dict with: `instruction`; `answer` if `generate_answers=True`;\n and, finally, the `model_name`.\n\n Args:\n instruction: The instruction to be included within the output.\n answer: The answer to be included within the output if `generate_answers=True`.\n\n Returns:\n If `generate_answers=True` return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...};\n if `generate_answers=False` return {\"instruction\": ..., \"model_name\": ...};\n \"\"\"\n _output = {\n \"instruction\": instruction,\n \"model_name\": self.llm.model_name,\n }\n if self.generate_answers and answer is not None:\n _output[\"answer\"] = answer\n return _output\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._apply_random_mutation","title":"_apply_random_mutation(iter_no) ","text":"Applies a random mutation from the ones provided as part of the mutation_templates enum, and returns the provided instruction within the mutation prompt. Parameters: Name Type Description Default iter_no int The iteration number to be used to check whether the iteration is the first one i.e. FRESH_START, or not. required Returns: Type Description List[ChatType] A random mutation prompt with the provided instruction formatted as an OpenAI conversation. Source code in src/distilabel/steps/tasks/evol_instruct/generator.py def _apply_random_mutation(self, iter_no: int) -> List[\"ChatType\"]:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n iter_no: The iteration number to be used to check whether the iteration is the\n first one i.e. FRESH_START, or not.\n\n Returns:\n A random mutation prompt with the provided instruction formatted as an OpenAI conversation.\n \"\"\"\n prompts = []\n for idx in range(self.num_instructions):\n if (\n iter_no == 0\n or \"Write one question or request containing\" in self._prompts[idx] # type: ignore\n ):\n mutation = \"FRESH_START\"\n else:\n mutation = np.random.choice(self.mutation_templates_names)\n if mutation == \"FRESH_START\":\n self._prompts[idx] = np.random.choice(self._seed_texts) # type: ignore\n\n prompt_with_template = (\n self.mutation_templates[mutation].replace( # type: ignore\n \"<PROMPT>\",\n self._prompts[idx], # type: ignore\n ) # type: ignore\n if iter_no != 0\n else self._prompts[idx] # type: ignore\n )\n prompts.append([{\"role\": \"user\", \"content\": prompt_with_template}])\n return prompts\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._generate_answers","title":"_generate_answers(instructions) ","text":"Generates the answer for the last instruction in instructions . Parameters: Name Type Description Default instructions List[List[str]] A list of lists where each item is a list with either the last evolved instruction if store_evolutions=False or all the evolved instructions if store_evolutions=True . required Returns: Type Description Tuple[List[str], LLMStatistics] A list of answers for the last instruction in instructions . 
Source code in src/distilabel/steps/tasks/evol_instruct/generator.py def _generate_answers(\n self, instructions: List[List[str]]\n) -> Tuple[List[str], \"LLMStatistics\"]:\n \"\"\"Generates the answer for the last instruction in `instructions`.\n\n Args:\n instructions: A list of lists where each item is a list with either the last\n evolved instruction if `store_evolutions=False` or all the evolved instructions\n if `store_evolutions=True`.\n\n Returns:\n A list of answers for the last instruction in `instructions`.\n \"\"\"\n # TODO: update to generate answers for all the instructions\n _formatted_instructions = [\n [{\"role\": \"user\", \"content\": instruction[-1]}]\n for instruction in instructions\n ]\n responses = self.llm.generate(\n _formatted_instructions,\n **self.llm.generation_kwargs, # type: ignore\n )\n statistics: Dict[str, Any] = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n return flatten_responses(\n [response[\"generations\"] for response in responses]\n ), dict(statistics)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.process","title":"process(offset=0) ","text":"Processes the inputs of the task and generates the outputs using the LLM. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0. 0 Yields: Type Description GeneratorStepOutput A list of Python dictionaries with the outputs of the task, and a boolean GeneratorStepOutput flag indicating whether the task has finished or not i.e. is the last batch. Source code in src/distilabel/steps/tasks/evol_instruct/generator.py @override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\": # NOQA: C901, type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n offset: The offset to start the generation from. Defaults to 0.\n\n Yields:\n A list of Python dictionaries with the outputs of the task, and a boolean\n flag indicating whether the task has finished or not i.e. 
is the last batch.\n \"\"\"\n instructions = []\n mutation_no = 0\n\n # TODO: update to take into account `offset`\n iter_no = 0\n while len(instructions) < self.num_instructions:\n prompts = self._apply_random_mutation(iter_no=iter_no)\n\n # TODO: Update the function to extract from the dict\n responses = self.llm.generate(prompts, **self.llm.generation_kwargs) # type: ignore\n\n generated_prompts = flatten_responses(\n [response[\"generations\"] for response in responses]\n )\n statistics: \"LLMStatistics\" = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n for idx, generated_prompt in enumerate(generated_prompts):\n generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n if self.max_length >= len(generated_prompt) >= self.min_length: # type: ignore\n instructions.append(generated_prompt)\n self._prompts[idx] = np.random.choice(self._seed_texts) # type: ignore\n else:\n self._prompts[idx] = generated_prompt # type: ignore\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} with {len(instructions)} instructions already evolved!\"\n )\n iter_no += 1\n\n if len(instructions) > self.num_instructions:\n instructions = instructions[: self.num_instructions]\n if len(instructions) > mutation_no:\n mutation_no = len(instructions) - mutation_no\n\n if not self.generate_answers and len(instructions[-mutation_no:]) > 0:\n formatted_generations = []\n for mutated_instruction in instructions[-mutation_no:]:\n mutated_instruction = self.format_output(mutated_instruction)\n mutated_instruction[\"distilabel_metadata\"] = {\n f\"statistics_instruction_{self.name}\": dict(statistics)\n }\n formatted_generations.append(mutated_instruction)\n yield (\n formatted_generations,\n len(instructions) >= self.num_instructions,\n )\n\n self._logger.info(f\"\ud83c\udf89 Finished evolving {len(instructions)} instructions!\")\n\n if self.generate_answers:\n self._logger.info(\n f\"\ud83e\udde0 Generating answers for the {len(instructions)} evolved instructions!\"\n )\n\n answers, statistics = self._generate_answers(instructions)\n\n self._logger.info(\n f\"\ud83c\udf89 Finished generating answers for the {len(instructions)} evolved instructions!\"\n )\n\n formatted_outputs = []\n for instruction, answer in zip(instructions, answers):\n formatted_output = self.format_output(instruction, answer)\n formatted_output[\"distilabel_metadata\"] = {\n f\"statistics_answer_{self.name}\": dict(statistics)\n }\n formatted_outputs.append(formatted_output)\n\n yield (\n formatted_outputs,\n True,\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality","title":"EvolQuality ","text":" Bases: Task Evolve the quality of the responses using an LLM . EvolQuality task is used to evolve the quality of the responses given a prompt, by generating a new response with a language model. This step implements the evolution quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. Attributes: Name Type Description num_evolutions int The number of evolutions to be performed on the responses. store_evolutions bool Whether to store all the evolved responses or just the last one. Defaults to False . include_original_response bool Whether to include the original response within the evolved responses. Defaults to False . mutation_templates Dict[str, str] The mutation templates to be used to evolve the responses. 
seed RuntimeParameter[int] The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . Runtime parameters seed : The seed to be set for numpy in order to randomly pick a mutation method. Input columns - instruction (
str ): The instruction that was used to generate the responses . - response (
str ): The response to be rewritten. Output columns - evolved_response (
str ): The evolved response if store_evolutions=False . - evolved_responses (
List[str] ): The evolved responses if store_evolutions=True . - model_name (
str ): The name of the LLM used to evolve the responses. Categories References What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning Examples: Evolve the quality of the responses given a prompt: from distilabel.steps.tasks import EvolQuality\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_quality = EvolQuality(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_quality.load()\n\nresult = next(\n evol_quality.process(\n [\n {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'response': 'a response',\n# 'evolved_response': 'evolved response',\n# 'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n# }\n# ]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n Source code in src/distilabel/steps/tasks/evol_quality/base.py class EvolQuality(Task):\n \"\"\"Evolve the quality of the responses using an `LLM`.\n\n `EvolQuality` task is used to evolve the quality of the responses given a prompt,\n by generating a new response with a language model. This step implements the evolution\n quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of\n Automatic Data Selection in Instruction Tuning'.\n\n Attributes:\n num_evolutions: The number of evolutions to be performed on the responses.\n store_evolutions: Whether to store all the evolved responses or just the last one.\n Defaults to `False`.\n include_original_response: Whether to include the original response within the evolved\n responses. Defaults to `False`.\n mutation_templates: The mutation templates to be used to evolve the responses.\n seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n Defaults to `42`.\n\n Runtime parameters:\n - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n Input columns:\n - instruction (`str`): The instruction that was used to generate the `responses`.\n - response (`str`): The responses to be rewritten.\n\n Output columns:\n - evolved_response (`str`): The evolved response if `store_evolutions=False`.\n - evolved_responses (`List[str]`): The evolved responses if `store_evolutions=True`.\n - model_name (`str`): The name of the LLM used to evolve the responses.\n\n Categories:\n - evol\n - response\n - deita\n\n References:\n - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n Examples:\n Evolve the quality of the responses given a prompt:\n\n ```python\n from distilabel.steps.tasks import EvolQuality\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_quality = EvolQuality(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n )\n\n evol_quality.load()\n\n result = next(\n evol_quality.process(\n [\n {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'common instruction',\n # 'response': 'a response',\n # 'evolved_response': 'evolved response',\n # 'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n \"\"\"\n\n num_evolutions: int\n store_evolutions: bool = False\n include_original_response: bool = False\n mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n\n seed: RuntimeParameter[int] = Field(\n default=42,\n description=\"As `numpy` is being used in order to randomly pick a mutation method, then is nice to set a random seed.\",\n )\n\n @override\n def model_post_init(self, __context: Any) -> None:\n \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n This is useful if you want to do some validation that requires the entire model to be initialized.\n \"\"\"\n super().model_post_init(__context)\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task are the `instruction` and `response`.\"\"\"\n return [\"instruction\", \"response\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. 
And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [{\"role\": \"user\", \"content\": input}]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are the `evolved_response/s` and the `model_name`.\"\"\"\n # TODO: having to define a `model_name` column every time as the `Task.outputs` is not ideal,\n # this could be handled always and the value could be included within the DAG validation when\n # a `Task` is used, since all the `Task` subclasses will have an `llm` with a `model_name` attr.\n _outputs = [\n (\"evolved_response\" if not self.store_evolutions else \"evolved_responses\"),\n \"model_name\",\n ]\n\n return _outputs\n\n def format_output(self, responses: Union[str, List[str]]) -> Dict[str, Any]: # type: ignore\n \"\"\"The output for the task is a dict with: `evolved_response` or `evolved_responses`,\n depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n and, finally, the `model_name`.\n\n Args:\n responses: The responses to be included within the output.\n\n Returns:\n if `store_evolutions=False` return {\"evolved_response\": ..., \"model_name\": ...};\n if `store_evolutions=True` return {\"evolved_responses\": ..., \"model_name\": ...}.\n \"\"\"\n _output = {}\n\n if not self.store_evolutions:\n _output[\"evolved_response\"] = responses[-1]\n else:\n _output[\"evolved_responses\"] = responses\n\n _output[\"model_name\"] = self.llm.model_name\n return _output\n\n @property\n def mutation_templates_names(self) -> List[str]:\n \"\"\"Returns the names i.e. keys of the provided `mutation_templates` enum.\"\"\"\n return list(self.mutation_templates.keys())\n\n def _apply_random_mutation(self, instruction: str, response: str) -> str:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n instruction: The instruction to be included within the mutation prompt.\n\n Returns:\n A random mutation prompt with the provided instruction.\n \"\"\"\n mutation = np.random.choice(self.mutation_templates_names)\n return (\n self.mutation_templates[mutation]\n .replace(\"<PROMPT>\", instruction)\n .replace(\"<RESPONSE>\", response)\n )\n\n def _evolve_reponses(\n self, inputs: \"StepInput\"\n ) -> Tuple[List[List[str]], Dict[str, Any]]:\n \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list where each item is a list with either the last evolved instruction if\n `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n \"\"\"\n np.random.seed(self.seed)\n instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n responses: List[List[str]] = [[input[\"response\"]] for input in inputs]\n statistics: Dict[str, Any] = defaultdict(list)\n\n for iter_no in range(self.num_evolutions):\n formatted_prompts = []\n for instruction, response in zip(instructions, responses):\n formatted_prompts.append(\n self._apply_random_mutation(instruction[-1], response[-1])\n )\n\n formatted_prompts = [\n self.format_input(prompt) for prompt in formatted_prompts\n ]\n\n generated_responses = self.llm.generate(\n formatted_prompts,\n **self.llm.generation_kwargs, # type: ignore\n )\n for response in generated_responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n if self.store_evolutions:\n 
responses = [\n response + [evolved_response[\"generations\"][0]]\n for response, evolved_response in zip(\n responses, generated_responses\n )\n ]\n else:\n responses = [\n [evolved_response[\"generations\"][0]]\n for evolved_response in generated_responses\n ]\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(responses)} responses!\"\n )\n\n return responses, dict(statistics)\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n responses, statistics = self._evolve_reponses(inputs)\n\n if self.store_evolutions:\n # Remove the input instruction from the `evolved_responses` list\n from_ = 1 if not self.include_original_response else 0\n responses = [response[from_:] for response in responses]\n\n for input, response in zip(inputs, responses):\n input.update(self.format_output(response))\n input.update(\n {\"distilabel_metadata\": {f\"statistics_{self.name}\": statistics}}\n )\n yield inputs\n\n self._logger.info(f\"\ud83c\udf89 Finished evolving {len(responses)} instructions!\")\n\n @override\n def _sample_input(self) -> ChatType:\n return self.format_input(\"<PLACEHOLDER_INSTRUCTION>\")\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.inputs","title":"inputs: List[str] property ","text":"The input for the task are the instruction and response . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.outputs","title":"outputs: List[str] property ","text":"The output for the task are the evolved_response/s and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.mutation_templates_names","title":"mutation_templates_names: List[str] property ","text":"Returns the names i.e. keys of the provided mutation_templates enum. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.model_post_init","title":"model_post_init(__context) ","text":"Override this method to perform additional initialization after __init__ and model_construct . This is useful if you want to do some validation that requires the entire model to be initialized. Source code in src/distilabel/steps/tasks/evol_quality/base.py @override\ndef model_post_init(self, __context: Any) -> None:\n \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n This is useful if you want to do some validation that requires the entire model to be initialized.\n \"\"\"\n super().model_post_init(__context)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. And the system_prompt is added as the first message if it exists. Source code in src/distilabel/steps/tasks/evol_quality/base.py def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. 
And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [{\"role\": \"user\", \"content\": input}]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.format_output","title":"format_output(responses) ","text":"The output for the task is a dict with: evolved_response or evolved_responses , depending whether the value is either False or True for store_evolutions , respectively; and, finally, the model_name . Parameters: Name Type Description Default responses Union[str, List[str]] The responses to be included within the output. required Returns: Type Description Dict[str, Any] if store_evolutions=False return {\"evolved_response\": ..., \"model_name\": ...}; Dict[str, Any] if store_evolutions=True return {\"evolved_responses\": ..., \"model_name\": ...}. Source code in src/distilabel/steps/tasks/evol_quality/base.py def format_output(self, responses: Union[str, List[str]]) -> Dict[str, Any]: # type: ignore\n \"\"\"The output for the task is a dict with: `evolved_response` or `evolved_responses`,\n depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n and, finally, the `model_name`.\n\n Args:\n responses: The responses to be included within the output.\n\n Returns:\n if `store_evolutions=False` return {\"evolved_response\": ..., \"model_name\": ...};\n if `store_evolutions=True` return {\"evolved_responses\": ..., \"model_name\": ...}.\n \"\"\"\n _output = {}\n\n if not self.store_evolutions:\n _output[\"evolved_response\"] = responses[-1]\n else:\n _output[\"evolved_responses\"] = responses\n\n _output[\"model_name\"] = self.llm.model_name\n return _output\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality._apply_random_mutation","title":"_apply_random_mutation(instruction, response) ","text":"Applies a random mutation from the ones provided as part of the mutation_templates enum, and returns the provided instruction within the mutation prompt. Parameters: Name Type Description Default instruction str The instruction to be included within the mutation prompt. required Returns: Type Description str A random mutation prompt with the provided instruction. Source code in src/distilabel/steps/tasks/evol_quality/base.py def _apply_random_mutation(self, instruction: str, response: str) -> str:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n instruction: The instruction to be included within the mutation prompt.\n\n Returns:\n A random mutation prompt with the provided instruction.\n \"\"\"\n mutation = np.random.choice(self.mutation_templates_names)\n return (\n self.mutation_templates[mutation]\n .replace(\"<PROMPT>\", instruction)\n .replace(\"<RESPONSE>\", response)\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality._evolve_reponses","title":"_evolve_reponses(inputs) ","text":"Evolves the instructions provided as part of the inputs of the task. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Returns: Type Description List[List[str]] A list where each item is a list with either the last evolved instruction if Dict[str, Any] store_evolutions=False or all the evolved instructions if store_evolutions=True . 
Source code in src/distilabel/steps/tasks/evol_quality/base.py def _evolve_reponses(\n self, inputs: \"StepInput\"\n) -> Tuple[List[List[str]], Dict[str, Any]]:\n \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list where each item is a list with either the last evolved instruction if\n `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n \"\"\"\n np.random.seed(self.seed)\n instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n responses: List[List[str]] = [[input[\"response\"]] for input in inputs]\n statistics: Dict[str, Any] = defaultdict(list)\n\n for iter_no in range(self.num_evolutions):\n formatted_prompts = []\n for instruction, response in zip(instructions, responses):\n formatted_prompts.append(\n self._apply_random_mutation(instruction[-1], response[-1])\n )\n\n formatted_prompts = [\n self.format_input(prompt) for prompt in formatted_prompts\n ]\n\n generated_responses = self.llm.generate(\n formatted_prompts,\n **self.llm.generation_kwargs, # type: ignore\n )\n for response in generated_responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n if self.store_evolutions:\n responses = [\n response + [evolved_response[\"generations\"][0]]\n for response, evolved_response in zip(\n responses, generated_responses\n )\n ]\n else:\n responses = [\n [evolved_response[\"generations\"][0]]\n for evolved_response in generated_responses\n ]\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(responses)} responses!\"\n )\n\n return responses, dict(statistics)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.process","title":"process(inputs) ","text":"Processes the inputs of the task and generates the outputs using the LLM. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Returns: Type Description StepOutput A list of Python dictionaries with the outputs of the task. Source code in src/distilabel/steps/tasks/evol_quality/base.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n responses, statistics = self._evolve_reponses(inputs)\n\n if self.store_evolutions:\n # Remove the input instruction from the `evolved_responses` list\n from_ = 1 if not self.include_original_response else 0\n responses = [response[from_:] for response in responses]\n\n for input, response in zip(inputs, responses):\n input.update(self.format_output(response))\n input.update(\n {\"distilabel_metadata\": {f\"statistics_{self.name}\": statistics}}\n )\n yield inputs\n\n self._logger.info(f\"\ud83c\udf89 Finished evolving {len(responses)} instructions!\")\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings","title":"GenerateEmbeddings ","text":" Bases: Step Generate embeddings using the last hidden state of an LLM . Generate embeddings for a text input using the last hidden state of an LLM , as described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. 
Attributes: Name Type Description llm LLM The LLM to use to generate the embeddings. Input columns - text (
str , List[Dict[str, str]] ): The input text or conversation to generate embeddings for. Output columns - embedding (
List[float] ): The embedding of the input text or conversation. - model_name (
str ): The model name used to generate the embeddings. Categories References - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
Examples: Rank LLM candidates: from distilabel.steps.tasks import GenerateEmbeddings\nfrom distilabel.models.llms.huggingface import TransformersLLM\n\n# Consider this as a placeholder for your actual LLM.\nembedder = GenerateEmbeddings(\n llm=TransformersLLM(\n model=\"TaylorAI/bge-micro-v2\",\n model_kwargs={\"is_decoder\": True},\n cuda_devices=[],\n )\n)\nembedder.load()\n\nresult = next(\n embedder.process(\n [\n {\"text\": \"Hello, how are you?\"},\n ]\n )\n)\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n Source code in src/distilabel/steps/tasks/generate_embeddings.py class GenerateEmbeddings(Step):\n \"\"\"Generate embeddings using the last hidden state of an `LLM`.\n\n Generate embeddings for a text input using the last hidden state of an `LLM`, as\n described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of\n Automatic Data Selection in Instruction Tuning'.\n\n Attributes:\n llm: The `LLM` to use to generate the embeddings.\n\n Input columns:\n - text (`str`, `List[Dict[str, str]]`): The input text or conversation to generate\n embeddings for.\n\n Output columns:\n - embedding (`List[float]`): The embedding of the input text or conversation.\n - model_name (`str`): The model name used to generate the embeddings.\n\n Categories:\n - embedding\n - llm\n\n References:\n - [What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n\n Examples:\n Rank LLM candidates:\n\n ```python\n from distilabel.steps.tasks import GenerateEmbeddings\n from distilabel.models.llms.huggingface import TransformersLLM\n\n # Consider this as a placeholder for your actual LLM.\n embedder = GenerateEmbeddings(\n llm=TransformersLLM(\n model=\"TaylorAI/bge-micro-v2\",\n model_kwargs={\"is_decoder\": True},\n cuda_devices=[],\n )\n )\n embedder.load()\n\n result = next(\n embedder.process(\n [\n {\"text\": \"Hello, how are you?\"},\n ]\n )\n )\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n \"\"\"\n\n llm: LLM\n\n def load(self) -> None:\n \"\"\"Loads the `LLM` used to generate the embeddings.\"\"\"\n super().load()\n\n self.llm.load()\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task is a `text` column containing either a string or a\n list of dictionaries in OpenAI chat-like format.\"\"\"\n return [\"text\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs for the task is an `embedding` column containing the embedding of\n the `text` input.\"\"\"\n return [\"embedding\", \"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Formats the input to be used by the LLM to generate the embeddings. The input\n can be in `ChatType` format or a string. 
If a string, it will be converted to a\n list of dictionaries in OpenAI chat-like format.\n\n Args:\n input: The input to format.\n\n Returns:\n The OpenAI chat-like format of the input.\n \"\"\"\n text = input[\"text\"] = input[\"text\"]\n\n # input is in `ChatType` format\n if isinstance(text, str):\n return [{\"role\": \"user\", \"content\": text}]\n\n if is_openai_format(text):\n return text\n\n raise DistilabelUserError(\n f\"Couldn't format input for step {self.name}. The `text` input column has to\"\n \" be a string or a list of dictionaries in OpenAI chat-like format.\",\n page=\"components-gallery/tasks/generateembeddings/\",\n )\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generates an embedding for each input using the last hidden state of the `LLM`.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n formatted_inputs = [self.format_input(input) for input in inputs]\n last_hidden_states = self.llm.get_last_hidden_states(formatted_inputs)\n for input, hidden_state in zip(inputs, last_hidden_states):\n input[\"embedding\"] = hidden_state[-1].tolist()\n input[\"model_name\"] = self.llm.model_name\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task is a text column containing either a string or a list of dictionaries in OpenAI chat-like format. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.outputs","title":"outputs: StepColumns property ","text":"The outputs for the task is an embedding column containing the embedding of the text input. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.load","title":"load() ","text":"Loads the LLM used to generate the embeddings. Source code in src/distilabel/steps/tasks/generate_embeddings.py def load(self) -> None:\n \"\"\"Loads the `LLM` used to generate the embeddings.\"\"\"\n super().load()\n\n self.llm.load()\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.format_input","title":"format_input(input) ","text":"Formats the input to be used by the LLM to generate the embeddings. The input can be in ChatType format or a string. If a string, it will be converted to a list of dictionaries in OpenAI chat-like format. Parameters: Name Type Description Default input Dict[str, Any] The input to format. required Returns: Type Description ChatType The OpenAI chat-like format of the input. Source code in src/distilabel/steps/tasks/generate_embeddings.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Formats the input to be used by the LLM to generate the embeddings. The input\n can be in `ChatType` format or a string. If a string, it will be converted to a\n list of dictionaries in OpenAI chat-like format.\n\n Args:\n input: The input to format.\n\n Returns:\n The OpenAI chat-like format of the input.\n \"\"\"\n text = input[\"text\"] = input[\"text\"]\n\n # input is in `ChatType` format\n if isinstance(text, str):\n return [{\"role\": \"user\", \"content\": text}]\n\n if is_openai_format(text):\n return text\n\n raise DistilabelUserError(\n f\"Couldn't format input for step {self.name}. 
The `text` input column has to\"\n \" be a string or a list of dictionaries in OpenAI chat-like format.\",\n page=\"components-gallery/tasks/generateembeddings/\",\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.process","title":"process(inputs) ","text":"Generates an embedding for each input using the last hidden state of the LLM . Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Yields: Type Description StepOutput A list of Python dictionaries with the outputs of the task. Source code in src/distilabel/steps/tasks/generate_embeddings.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generates an embedding for each input using the last hidden state of the `LLM`.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n formatted_inputs = [self.format_input(input) for input in inputs]\n last_hidden_states = self.llm.get_last_hidden_states(formatted_inputs)\n for input, hidden_state in zip(inputs, last_hidden_states):\n input[\"embedding\"] = hidden_state[-1].tolist()\n input[\"model_name\"] = self.llm.model_name\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct","title":"Genstruct ","text":" Bases: Task Generate a pair of instruction-response from a document using an LLM . Genstruct is a pre-defined task designed to generate valid instructions from a given raw document, with the title and the content, enabling the creation of new, partially synthetic instruction finetuning datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is inspired in the Ada-Instruct paper. Note The Genstruct prompt i.e. the task, can be used with any model really, but the safest / recommended option is to use NousResearch/Genstruct-7B as the LLM provided to the task, since it was trained for this specific task. Attributes: Name Type Description _template Union[Template, None] a Jinja2 template used to format the input for the LLM. Input columns - title (
str ): The title of the document. - content (
str ): The content of the document. Output columns - user (
str ): The user's instruction based on the document. - assistant (
str ): The assistant's response based on the user's instruction. - model_name (
str ): The model name used to generate the user instruction and the assistant response. Categories - text-generation
- instruction
- response
References - Genstruct 7B by Nous Research
- Ada-Instruct: Adapting Instruction Generators for Complex Reasoning
Examples: Generate instructions from raw documents using the title and content: from distilabel.steps.tasks import Genstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ngenstruct = Genstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"NousResearch/Genstruct-7B\",\n ),\n)\n\ngenstruct.load()\n\nresult = next(\n genstruct.process(\n [\n {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'title': 'An instruction',\n# 'content': 'content of the document',\n# 'model_name': 'test',\n# 'user': 'An instruction',\n# 'assistant': 'content of the document',\n# }\n# ]\n Citations @misc{cui2023adainstructadaptinginstructiongenerators,\n title={Ada-Instruct: Adapting Instruction Generators for Complex Reasoning},\n author={Wanyun Cui and Qianle Wang},\n year={2023},\n eprint={2310.04484},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2310.04484},\n}\n Source code in src/distilabel/steps/tasks/genstruct.py class Genstruct(Task):\n \"\"\"Generate a pair of instruction-response from a document using an `LLM`.\n\n `Genstruct` is a pre-defined task designed to generate valid instructions from a given raw document,\n with the title and the content, enabling the creation of new, partially synthetic instruction finetuning\n datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is\n inspired in the Ada-Instruct paper.\n\n Note:\n The Genstruct prompt i.e. the task, can be used with any model really, but the safest / recommended\n option is to use `NousResearch/Genstruct-7B` as the LLM provided to the task, since it was trained\n for this specific task.\n\n Attributes:\n _template: a Jinja2 template used to format the input for the LLM.\n\n Input columns:\n - title (`str`): The title of the document.\n - content (`str`): The content of the document.\n\n Output columns:\n - user (`str`): The user's instruction based on the document.\n - assistant (`str`): The assistant's response based on the user's instruction.\n - model_name (`str`): The model name used to generate the `feedback` and `result`.\n\n Categories:\n - text-generation\n - instruction\n - response\n\n References:\n - [Genstruct 7B by Nous Research](https://huggingface.co/NousResearch/Genstruct-7B)\n - [Ada-Instruct: Adapting Instruction Generators for Complex Reasoning](https://arxiv.org/abs/2310.04484)\n\n Examples:\n Generate instructions from raw documents using the title and content:\n\n ```python\n from distilabel.steps.tasks import Genstruct\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n genstruct = Genstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"NousResearch/Genstruct-7B\",\n ),\n )\n\n genstruct.load()\n\n result = next(\n genstruct.process(\n [\n {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'title': 'An instruction',\n # 'content': 'content of the document',\n # 'model_name': 'test',\n # 'user': 'An instruction',\n # 'assistant': 'content of the document',\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{cui2023adainstructadaptinginstructiongenerators,\n title={Ada-Instruct: Adapting Instruction Generators for Complex Reasoning},\n author={Wanyun Cui and Qianle Wang},\n year={2023},\n eprint={2310.04484},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2310.04484},\n 
}\n ```\n \"\"\"\n\n _template: Union[Template, None] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"genstruct.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the task are the `title` and the `content`.\"\"\"\n return [\"title\", \"content\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n title=input[\"title\"], content=input[\"content\"]\n ),\n }\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are the `user` instruction based on the provided document\n and the `assistant` response based on the user's instruction.\"\"\"\n return [\"user\", \"assistant\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted so that both the user and the assistant messages are\n captured.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the keys `user` and `assistant` containing the content for each role.\n \"\"\"\n if output is None:\n return {\"user\": None, \"assistant\": None}\n\n matches = re.search(_PARSE_GENSTRUCT_OUTPUT_REGEX, output, re.DOTALL)\n if not matches:\n return {\"user\": None, \"assistant\": None}\n\n return {\n \"user\": matches.group(1).strip(),\n \"assistant\": matches.group(2).strip(),\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.inputs","title":"inputs: List[str] property ","text":"The inputs for the task are the title and the content . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.outputs","title":"outputs: List[str] property ","text":"The output for the task are the user instruction based on the provided document and the assistant response based on the user's instruction. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/genstruct.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"genstruct.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. 
Source code in src/distilabel/steps/tasks/genstruct.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n title=input[\"title\"], content=input[\"content\"]\n ),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.format_output","title":"format_output(output, input) ","text":"The output is formatted so that both the user and the assistant messages are captured. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the keys user and assistant containing the content for each role. Source code in src/distilabel/steps/tasks/genstruct.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted so that both the user and the assistant messages are\n captured.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the keys `user` and `assistant` containing the content for each role.\n \"\"\"\n if output is None:\n return {\"user\": None, \"assistant\": None}\n\n matches = re.search(_PARSE_GENSTRUCT_OUTPUT_REGEX, output, re.DOTALL)\n if not matches:\n return {\"user\": None, \"assistant\": None}\n\n return {\n \"user\": matches.group(1).strip(),\n \"assistant\": matches.group(2).strip(),\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator","title":"BitextRetrievalGenerator ","text":" Bases: _EmbeddingDataGenerator Generate bitext retrieval data with an LLM to later on train an embedding model. BitextRetrievalGenerator is a GeneratorTask that generates bitext retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Attributes: Name Type Description source_language str The source language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. target_language str The target language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. unit Optional[Literal['sentence', 'phrase', 'passage']] The unit of the data to be generated, which can be sentence , phrase , or passage . Defaults to None , meaning that it will be randomly sampled. difficulty Optional[Literal['elementary school', 'high school', 'college']] The difficulty of the query to be generated, which can be elementary school , high school , or college . Defaults to None , meaning that it will be randomly sampled. high_score Optional[Literal['4', '4.5', '5']] The high score of the query to be generated, which can be 4 , 4.5 , or 5 . Defaults to None , meaning that it will be randomly sampled. low_score Optional[Literal['2.5', '3', '3.5']] The low score of the query to be generated, which can be 2.5 , 3 , or 3.5 . 
Defaults to None , meaning that it will be randomly sampled. seed Optional[Literal['2.5', '3', '3.5']] The random seed to be set in case there's any sampling within the format_input method. Output columns - S1 (
str ): the first sentence generated by the LLM . - S2 (
str ): the second sentence generated by the LLM . - S3 (
str ): the third sentence generated by the LLM . - model_name (
str ): the name of the model used to generate the bitext retrieval data. Examples: Generate bitext retrieval data for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import BitextRetrievalGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = BitextRetrievalGenerator(\n source_language=\"English\",\n target_language=\"Spanish\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class BitextRetrievalGenerator(_EmbeddingDataGenerator):\n \"\"\"Generate bitext retrieval data with an `LLM` to later on train an embedding model.\n\n `BitextRetrievalGenerator` is a `GeneratorTask` that generates bitext retrieval data with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Attributes:\n source_language: The source language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n target_language: The target language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n unit: The unit of the data to be generated, which can be `sentence`, `phrase`, or `passage`.\n Defaults to `None`, meaning that it will be randomly sampled.\n difficulty: The difficulty of the query to be generated, which can be `elementary school`, `high school`, or `college`.\n Defaults to `None`, meaning that it will be randomly sampled.\n high_score: The high score of the query to be generated, which can be `4`, `4.5`, or `5`.\n Defaults to `None`, meaning that it will be randomly sampled.\n low_score: The low score of the query to be generated, which can be `2.5`, `3`, or `3.5`.\n Defaults to `None`, meaning that it will be randomly sampled.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n Output columns:\n - S1 (`str`): the first sentence generated by the `LLM`.\n - S2 (`str`): the second sentence generated by the `LLM`.\n - S3 (`str`): the third sentence generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the bitext retrieval\n data.\n\n Examples:\n Generate bitext retrieval data for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import BitextRetrievalGenerator\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = BitextRetrievalGenerator(\n source_language=\"English\",\n target_language=\"Spanish\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n ```\n \"\"\"\n\n source_language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n target_language: str = Field(\n default=...,\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n unit: Optional[Literal[\"sentence\", \"phrase\", \"passage\"]] = None\n difficulty: Optional[Literal[\"elementary 
school\", \"high school\", \"college\"]] = None\n high_score: Optional[Literal[\"4\", \"4.5\", \"5\"]] = None\n low_score: Optional[Literal[\"2.5\", \"3\", \"3.5\"]] = None\n\n _template_name: str = PrivateAttr(default=\"bitext-retrieval\")\n _can_be_used_with_offline_batch_generation = True\n\n @property\n def prompt(self) -> ChatType:\n \"\"\"Contains the `prompt` to be used in the `process` method, rendering the `_template`; and\n formatted as an OpenAI formatted chat i.e. a `ChatType`, assuming that there's only one turn,\n being from the user with the content being the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n source_language=self.source_language,\n target_language=self.target_language,\n unit=self.unit or random.choice([\"sentence\", \"phrase\", \"passage\"]),\n difficulty=self.difficulty\n or random.choice([\"elementary school\", \"high school\", \"college\"]),\n high_score=self.high_score or random.choice([\"4\", \"4.5\", \"5\"]),\n low_score=self.low_score or random.choice([\"2.5\", \"3\", \"3.5\"]),\n ).strip(),\n }\n ] # type: ignore\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\"S1\", \"S2\", \"S3\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator.prompt","title":"prompt: ChatType property ","text":"Contains the prompt to be used in the process method, rendering the _template ; and formatted as an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData","title":"GenerateLongTextMatchingData ","text":" Bases: _EmbeddingDataGeneration Generate long text matching data with an LLM to later on train an embedding model. GenerateLongTextMatchingData is a Task that generates long text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Note Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-long\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-long category. Attributes: Name Type Description language str The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. seed str The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params. Input columns - task (
str ): The task description to be used in the generation. Output columns - input (
str ): the input generated by the LLM . - positive_document (
str ): the positive document generated by the LLM . - model_name (
str ): the name of the model used to generate the long text matching data. References - Improving Text Embeddings with Large Language Models
Examples: Generate synthetic long text matching data for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateLongTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-long\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateLongTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class GenerateLongTextMatchingData(_EmbeddingDataGeneration):\n \"\"\"Generate long text matching data with an `LLM` to later on train an embedding model.\n\n `GenerateLongTextMatchingData` is a `Task` that generates long text matching data with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Note:\n Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n with the `category=\"text-matching-long\"`; so that the `LLM` generates a list of tasks that\n are flattened so that each row contains a single task for the text-matching-long category.\n\n Attributes:\n language: The language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n Note that in this task the `seed` has no effect since there are no sampling params.\n\n Input columns:\n - task (`str`): The task description to be used in the generation.\n\n Output columns:\n - input (`str`): the input generated by the `LLM`.\n - positive_document (`str`): the positive document generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the long text matching\n data.\n\n References:\n - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n Examples:\n Generate synthetic long text matching data for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateLongTextMatchingData\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-long\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateLongTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n ```\n \"\"\"\n\n language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n _template_name: str = PrivateAttr(default=\"long-text-matching\")\n _can_be_used_with_offline_batch_generation = True\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n ).strip(),\n }\n ]\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\"input\", \"positive_document\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData.format_input","title":"format_input(input) ","text":"Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . Parameters: Name Type Description Default input Dict[str, Any] The input dictionary containing the task to be used in the _template . required Returns: Type Description ChatType A list with a single chat containing the user's message with the rendered _template . Source code in src/distilabel/steps/tasks/improving_text_embeddings.py def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n ).strip(),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData","title":"GenerateShortTextMatchingData ","text":" Bases: _EmbeddingDataGeneration Generate short text matching data with an LLM to later on train an embedding model. GenerateShortTextMatchingData is a Task that generates short text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Note Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-short\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-short category. 
Attributes: Name Type Description language str The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. seed str The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params. Input columns - task (
str ): The task description to be used in the generation. Output columns - input (
str ): the input generated by the LLM . - positive_document (
str ): the positive document generated by the LLM . - model_name (
str ): the name of the model used to generate the short text matching data. References - Improving Text Embeddings with Large Language Models
Examples: Generate synthetic short text matching data for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateShortTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-short\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateShortTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class GenerateShortTextMatchingData(_EmbeddingDataGeneration):\n \"\"\"Generate short text matching data with an `LLM` to later on train an embedding model.\n\n `GenerateShortTextMatchingData` is a `Task` that generates short text matching data with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Note:\n Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n with the `category=\"text-matching-short\"`; so that the `LLM` generates a list of tasks that\n are flattened so that each row contains a single task for the text-matching-short category.\n\n Attributes:\n language: The language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n Note that in this task the `seed` has no effect since there are no sampling params.\n\n Input columns:\n - task (`str`): The task description to be used in the generation.\n\n Output columns:\n - input (`str`): the input generated by the `LLM`.\n - positive_document (`str`): the positive document generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the short text matching\n data.\n\n References:\n - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n Examples:\n Generate synthetic short text matching data for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateShortTextMatchingData\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-short\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateShortTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n ```\n \"\"\"\n\n language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n _template_name: str = PrivateAttr(default=\"short-text-matching\")\n _can_be_used_with_offline_batch_generation = True\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n ).strip(),\n }\n ]\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\"input\", \"positive_document\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData.format_input","title":"format_input(input) ","text":"Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n ).strip(),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData","title":"GenerateTextClassificationData ","text":" Bases: _EmbeddingDataGeneration Generate text classification data with an LLM to later on train an embedding model. GenerateTextClassificationData is a Task that generates text classification data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Note Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-classification\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-classification category. Attributes: Name Type Description language str The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. 
difficulty Optional[Literal['high school', 'college', 'PhD']] The difficulty of the query to be generated, which can be high school , college , or PhD . Defaults to None , meaning that it will be randomly sampled. clarity Optional[Literal['clear', 'understandable with some effort', 'ambiguous']] The clarity of the query to be generated, which can be clear , understandable with some effort , or ambiguous . Defaults to None , meaning that it will be randomly sampled. seed Optional[Literal['clear', 'understandable with some effort', 'ambiguous']] The random seed to be set in case there's any sampling within the format_input method. Input columns - task (
str ): The task description to be used in the generation. Output columns - input_text (
str ): the input text generated by the LLM . - label (
str ): the label generated by the LLM . - misleading_label (
str ): the misleading label generated by the LLM . - model_name (
str ): the name of the model used to generate the text classification data. References - Improving Text Embeddings with Large Language Models
Examples: Generate synthetic text classification data for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-classification\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextClassificationData(\n language=\"English\",\n difficulty=\"high school\",\n clarity=\"clear\",\n llm=..., # LLM instance\n )\n\n task >> generate\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class GenerateTextClassificationData(_EmbeddingDataGeneration):\n \"\"\"Generate text classification data with an `LLM` to later on train an embedding model.\n\n `GenerateTextClassificationData` is a `Task` that generates text classification data with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Note:\n Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n with the `category=\"text-classification\"`; so that the `LLM` generates a list of tasks that\n are flattened so that each row contains a single task for the text-classification category.\n\n Attributes:\n language: The language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n difficulty: The difficulty of the query to be generated, which can be `high school`, `college`, or `PhD`.\n Defaults to `None`, meaning that it will be randomly sampled.\n clarity: The clarity of the query to be generated, which can be `clear`, `understandable with some effort`,\n or `ambiguous`. 
Defaults to `None`, meaning that it will be randomly sampled.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n Input columns:\n - task (`str`): The task description to be used in the generation.\n\n Output columns:\n - input_text (`str`): the input text generated by the `LLM`.\n - label (`str`): the label generated by the `LLM`.\n - misleading_label (`str`): the misleading label generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the text classification\n data.\n\n References:\n - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n Examples:\n Generate synthetic text classification data for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-classification\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextClassificationData(\n language=\"English\",\n difficulty=\"high school\",\n clarity=\"clear\",\n llm=..., # LLM instance\n )\n\n task >> generate\n ```\n \"\"\"\n\n language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n difficulty: Optional[Literal[\"high school\", \"college\", \"PhD\"]] = None\n clarity: Optional[\n Literal[\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ] = None\n\n _template_name: str = PrivateAttr(default=\"text-classification\")\n _can_be_used_with_offline_batch_generation = True\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n difficulty=self.difficulty\n or random.choice([\"high school\", \"college\", \"PhD\"]),\n clarity=self.clarity\n or random.choice(\n [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ),\n ).strip(),\n }\n ]\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\"input_text\", \"label\", \"misleading_label\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData.format_input","title":"format_input(input) ","text":"Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. 
This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . Parameters: Name Type Description Default input Dict[str, Any] The input dictionary containing the task to be used in the _template . required Returns: Type Description ChatType A list with a single chat containing the user's message with the rendered _template . Source code in src/distilabel/steps/tasks/improving_text_embeddings.py def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n difficulty=self.difficulty\n or random.choice([\"high school\", \"college\", \"PhD\"]),\n clarity=self.clarity\n or random.choice(\n [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ),\n ).strip(),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData","title":"GenerateTextRetrievalData ","text":" Bases: _EmbeddingDataGeneration Generate text retrieval data with an LLM to later on train an embedding model. GenerateTextRetrievalData is a Task that generates text retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Note Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-retrieval\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-retrieval category. Attributes: Name Type Description language str The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. query_type Optional[Literal['extremely long-tail', 'long-tail', 'common']] The type of query to be generated, which can be extremely long-tail , long-tail , or common . Defaults to None , meaning that it will be randomly sampled. query_length Optional[Literal['less than 5 words', '5 to 15 words', 'at least 10 words']] The length of the query to be generated, which can be less than 5 words , 5 to 15 words , or at least 10 words . Defaults to None , meaning that it will be randomly sampled. difficulty Optional[Literal['high school', 'college', 'PhD']] The difficulty of the query to be generated, which can be high school , college , or PhD . Defaults to None , meaning that it will be randomly sampled. clarity Optional[Literal['clear', 'understandable with some effort', 'ambiguous']] The clarity of the query to be generated, which can be clear , understandable with some effort , or ambiguous . 
Defaults to None , meaning that it will be randomly sampled. num_words Optional[Literal[50, 100, 200, 300, 400, 500]] The number of words in the query to be generated, which can be 50 , 100 , 200 , 300 , 400 , or 500 . Defaults to None , meaning that it will be randomly sampled. seed Optional[Literal[50, 100, 200, 300, 400, 500]] The random seed to be set in case there's any sampling within the format_input method. Input columns - task (
str ): The task description to be used in the generation. Output columns - user_query (
str ): the user query generated by the LLM . - positive_document (
str ): the positive document generated by the LLM . - hard_negative_document (
str ): the hard negative document generated by the LLM . - model_name (
str ): the name of the model used to generate the text retrieval data. References - Improving Text Embeddings with Large Language Models
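Note that the generation is expected to come back as a JSON object containing the keys listed in keys , which is then parsed into a Python dict (an illustrative, abbreviated shape, not a real model output): {\n  \"user_query\": \"...\",\n  \"positive_document\": \"...\",\n  \"hard_negative_document\": \"...\"\n}\n 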
Examples: Generate synthetic text retrieval data for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-retrieval\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextRetrievalData(\n language=\"English\",\n query_type=\"common\",\n query_length=\"5 to 15 words\",\n difficulty=\"high school\",\n clarity=\"clear\",\n num_words=100,\n llm=..., # LLM instance\n )\n\n task >> generate\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class GenerateTextRetrievalData(_EmbeddingDataGeneration):\n \"\"\"Generate text retrieval data with an `LLM` to later on train an embedding model.\n\n `GenerateTextRetrievalData` is a `Task` that generates text retrieval data with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Note:\n Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n with the `category=\"text-retrieval\"`; so that the `LLM` generates a list of tasks that\n are flattened so that each row contains a single task for the text-retrieval category.\n\n Attributes:\n language: The language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n query_type: The type of query to be generated, which can be `extremely long-tail`, `long-tail`,\n or `common`. Defaults to `None`, meaning that it will be randomly sampled.\n query_length: The length of the query to be generated, which can be `less than 5 words`, `5 to 15 words`,\n or `at least 10 words`. Defaults to `None`, meaning that it will be randomly sampled.\n difficulty: The difficulty of the query to be generated, which can be `high school`, `college`, or `PhD`.\n Defaults to `None`, meaning that it will be randomly sampled.\n clarity: The clarity of the query to be generated, which can be `clear`, `understandable with some effort`,\n or `ambiguous`. 
Defaults to `None`, meaning that it will be randomly sampled.\n num_words: The number of words in the query to be generated, which can be `50`, `100`, `200`, `300`, `400`, or `500`.\n Defaults to `None`, meaning that it will be randomly sampled.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n Input columns:\n - task (`str`): The task description to be used in the generation.\n\n Output columns:\n - user_query (`str`): the user query generated by the `LLM`.\n - positive_document (`str`): the positive document generated by the `LLM`.\n - hard_negative_document (`str`): the hard negative document generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the text retrieval data.\n\n References:\n - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n Examples:\n Generate synthetic text retrieval data for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-retrieval\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextRetrievalData(\n language=\"English\",\n query_type=\"common\",\n query_length=\"5 to 15 words\",\n difficulty=\"high school\",\n clarity=\"clear\",\n num_words=100,\n llm=..., # LLM instance\n )\n\n task >> generate\n ```\n \"\"\"\n\n language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n query_type: Optional[Literal[\"extremely long-tail\", \"long-tail\", \"common\"]] = None\n query_length: Optional[\n Literal[\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n ] = None\n difficulty: Optional[Literal[\"high school\", \"college\", \"PhD\"]] = None\n clarity: Optional[\n Literal[\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ] = None\n num_words: Optional[Literal[50, 100, 200, 300, 400, 500]] = None\n\n _template_name: str = PrivateAttr(default=\"text-retrieval\")\n _can_be_used_with_offline_batch_generation = True\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n query_type=self.query_type\n or random.choice([\"extremely long-tail\", \"long-tail\", \"common\"]),\n query_length=self.query_length\n or random.choice(\n [\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n ),\n difficulty=self.difficulty\n or random.choice([\"high school\", \"college\", \"PhD\"]),\n clarity=self.clarity\n or random.choice(\n [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ),\n num_words=self.num_words\n or random.choice([50, 100, 200, 300, 400, 500]),\n ).strip(),\n }\n ]\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\n \"user_query\",\n \"positive_document\",\n \"hard_negative_document\",\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData.format_input","title":"format_input(input) ","text":"Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . Parameters: Name Type Description Default input Dict[str, Any] The input dictionary containing the task to be used in the _template . required Returns: Type Description ChatType A list with a single chat containing the user's message with the rendered _template . Source code in src/distilabel/steps/tasks/improving_text_embeddings.py def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n query_type=self.query_type\n or random.choice([\"extremely long-tail\", \"long-tail\", \"common\"]),\n query_length=self.query_length\n or random.choice(\n [\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n ),\n difficulty=self.difficulty\n or random.choice([\"high school\", \"college\", \"PhD\"]),\n clarity=self.clarity\n or random.choice(\n [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ),\n num_words=self.num_words\n or random.choice([50, 100, 200, 300, 400, 500]),\n ).strip(),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator","title":"MonolingualTripletGenerator ","text":" Bases: _EmbeddingDataGenerator Generate monolingual triplets with an LLM to later on train an embedding model. MonolingualTripletGenerator is a GeneratorTask that generates monolingual triplets with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Attributes: Name Type Description language str The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. unit Optional[Literal['sentence', 'phrase', 'passage']] The unit of the data to be generated, which can be sentence , phrase , or passage . Defaults to None , meaning that it will be randomly sampled. difficulty Optional[Literal['elementary school', 'high school', 'college']] The difficulty of the query to be generated, which can be elementary school , high school , or college . Defaults to None , meaning that it will be randomly sampled. high_score Optional[Literal['4', '4.5', '5']] The high score of the query to be generated, which can be 4 , 4.5 , or 5 . Defaults to None , meaning that it will be randomly sampled. low_score Optional[Literal['2.5', '3', '3.5']] The low score of the query to be generated, which can be 2.5 , 3 , or 3.5 . Defaults to None , meaning that it will be randomly sampled. seed Optional[Literal['2.5', '3', '3.5']] The random seed to be set in case there's any sampling within the format_input method. Output columns - S1 (
str ): the first sentence generated by the LLM . - S2 (
str ): the second sentence generated by the LLM . - S3 (
str ): the third sentence generated by the LLM . - model_name (
str ): the name of the model used to generate the monolingual triplets. Examples: Generate monolingual triplets for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import MonolingualTripletGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = MonolingualTripletGenerator(\n language=\"English\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class MonolingualTripletGenerator(_EmbeddingDataGenerator):\n \"\"\"Generate monolingual triplets with an `LLM` to later on train an embedding model.\n\n `MonolingualTripletGenerator` is a `GeneratorTask` that generates monolingual triplets with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Attributes:\n language: The language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n unit: The unit of the data to be generated, which can be `sentence`, `phrase`, or `passage`.\n Defaults to `None`, meaning that it will be randomly sampled.\n difficulty: The difficulty of the query to be generated, which can be `elementary school`, `high school`, or `college`.\n Defaults to `None`, meaning that it will be randomly sampled.\n high_score: The high score of the query to be generated, which can be `4`, `4.5`, or `5`.\n Defaults to `None`, meaning that it will be randomly sampled.\n low_score: The low score of the query to be generated, which can be `2.5`, `3`, or `3.5`.\n Defaults to `None`, meaning that it will be randomly sampled.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n Output columns:\n - S1 (`str`): the first sentence generated by the `LLM`.\n - S2 (`str`): the second sentence generated by the `LLM`.\n - S3 (`str`): the third sentence generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the monolingual triplets.\n\n Examples:\n Generate monolingual triplets for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import MonolingualTripletGenerator\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = MonolingualTripletGenerator(\n language=\"English\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n ```\n \"\"\"\n\n language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n unit: Optional[Literal[\"sentence\", \"phrase\", \"passage\"]] = None\n difficulty: Optional[Literal[\"elementary school\", \"high school\", \"college\"]] = None\n high_score: Optional[Literal[\"4\", \"4.5\", \"5\"]] = None\n low_score: Optional[Literal[\"2.5\", \"3\", \"3.5\"]] = None\n\n _template_name: str = PrivateAttr(default=\"monolingual-triplet\")\n _can_be_used_with_offline_batch_generation = True\n\n @property\n def prompt(self) -> ChatType:\n \"\"\"Contains the `prompt` to be used in the `process` method, rendering the `_template`; and\n formatted as an OpenAI formatted chat i.e. 
a `ChatType`, assuming that there's only one turn,\n being from the user with the content being the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n language=self.language,\n unit=self.unit or random.choice([\"sentence\", \"phrase\", \"passage\"]),\n difficulty=self.difficulty\n or random.choice([\"elementary school\", \"high school\", \"college\"]),\n high_score=self.high_score or random.choice([\"4\", \"4.5\", \"5\"]),\n low_score=self.low_score or random.choice([\"2.5\", \"3\", \"3.5\"]),\n ).strip(),\n }\n ] # type: ignore\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\"S1\", \"S2\", \"S3\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator.prompt","title":"prompt: ChatType property ","text":"Contains the prompt to be used in the process method, rendering the _template ; and formatted as an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation","title":"InstructionBacktranslation ","text":" Bases: Task Self-Alignment with Instruction Backtranslation. Attributes: Name Type Description _template Optional[Template] the Jinja2 template to use for the Instruction Backtranslation task. Input columns - instruction (
str ): The reference instruction to evaluate the text output. - generation (
str ): The text output to evaluate for the given instruction. Output columns - score (
str ): The score for the generation based on the given instruction. - reason (
str ): The reason for the provided score. - model_name (
str ): The model name used to score the generation. Categories References Self-Alignment with Instruction Backtranslation Examples: Generate a score and reason for a given instruction and generation: from distilabel.steps.tasks import InstructionBacktranslation\n\ninstruction_backtranslation = InstructionBacktranslation(\n name=\"instruction_backtranslation\",\n llm=llm,\n input_batch_size=10,\n output_mappings={\"model_name\": \"scoring_model\"},\n )\ninstruction_backtranslation.load()\n\nresult = next(\n instruction_backtranslation.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generation\": \"4\",\n }\n ]\n )\n)\n# result\n# [\n# {\n# \"instruction\": \"How much is 2+2?\",\n# \"generation\": \"4\",\n# \"score\": 3,\n# \"reason\": \"Reason for the generation.\",\n# \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n# }\n# ]\n Citations @misc{li2024selfalignmentinstructionbacktranslation,\n title={Self-Alignment with Instruction Backtranslation},\n author={Xian Li and Ping Yu and Chunting Zhou and Timo Schick and Omer Levy and Luke Zettlemoyer and Jason Weston and Mike Lewis},\n year={2024},\n eprint={2308.06259},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2308.06259},\n}\n Source code in src/distilabel/steps/tasks/instruction_backtranslation.py class InstructionBacktranslation(Task):\n \"\"\"Self-Alignment with Instruction Backtranslation.\n\n Attributes:\n _template: the Jinja2 template to use for the Instruction Backtranslation task.\n\n Input columns:\n - instruction (`str`): The reference instruction to evaluate the text output.\n - generation (`str`): The text output to evaluate for the given instruction.\n\n Output columns:\n - score (`str`): The score for the generation based on the given instruction.\n - reason (`str`): The reason for the provided score.\n - model_name (`str`): The model name used to score the generation.\n\n Categories:\n - critique\n\n References:\n - [`Self-Alignment with Instruction Backtranslation`](https://arxiv.org/abs/2308.06259)\n\n Examples:\n Generate a score and reason for a given instruction and generation:\n\n ```python\n from distilabel.steps.tasks import InstructionBacktranslation\n\n instruction_backtranslation = InstructionBacktranslation(\n name=\"instruction_backtranslation\",\n llm=llm,\n input_batch_size=10,\n output_mappings={\"model_name\": \"scoring_model\"},\n )\n instruction_backtranslation.load()\n\n result = next(\n instruction_backtranslation.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generation\": \"4\",\n }\n ]\n )\n )\n # result\n # [\n # {\n # \"instruction\": \"How much is 2+2?\",\n # \"generation\": \"4\",\n # \"score\": 3,\n # \"reason\": \"Reason for the generation.\",\n # \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{li2024selfalignmentinstructionbacktranslation,\n title={Self-Alignment with Instruction Backtranslation},\n author={Xian Li and Ping Yu and Chunting Zhou and Timo Schick and Omer Levy and Luke Zettlemoyer and Jason Weston and Mike Lewis},\n year={2024},\n eprint={2308.06259},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2308.06259},\n }\n ```\n \"\"\"\n\n _template: Optional[\"Template\"] = PrivateAttr(default=...)\n _can_be_used_with_offline_batch_generation = True\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / 
\"templates\"\n / \"instruction-backtranslation.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`, and the `generation` for it.\"\"\"\n return [\"instruction\", \"generation\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], generation=input[\"generation\"]\n ),\n },\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `score`, `reason` and the `model_name`.\"\"\"\n return [\"score\", \"reason\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `score` and `reason`. The\n `model_name` will be automatically included within the `process` method of `Task`.\n\n Args:\n output: a string representing the output of the LLM via the `process` method.\n input: the input to the task, as required by some tasks to format the output.\n\n Returns:\n A dictionary containing the `score` and the `reason` for the provided `score`.\n \"\"\"\n pattern = r\"(.+?)Score: (\\d)\"\n\n matches = None\n if output is not None:\n matches = re.findall(pattern, output, re.DOTALL)\n if matches is None:\n return {\"score\": None, \"reason\": None}\n\n return {\n \"score\": int(matches[0][1]),\n \"reason\": matches[0][0].strip(),\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.inputs","title":"inputs: List[str] property ","text":"The input for the task is the instruction , and the generation for it. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.outputs","title":"outputs: List[str] property ","text":"The output for the task is the score , reason and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/instruction_backtranslation.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"instruction-backtranslation.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. 
Source code in src/distilabel/steps/tasks/instruction_backtranslation.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], generation=input[\"generation\"]\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.format_output","title":"format_output(output, input) ","text":"The output is formatted as a dictionary with the score and reason . The model_name will be automatically included within the process method of Task . Parameters: Name Type Description Default output Union[str, None] a string representing the output of the LLM via the process method. required input Dict[str, Any] the input to the task, as required by some tasks to format the output. required Returns: Type Description Dict[str, Any] A dictionary containing the score and the reason for the provided score . Source code in src/distilabel/steps/tasks/instruction_backtranslation.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `score` and `reason`. The\n `model_name` will be automatically included within the `process` method of `Task`.\n\n Args:\n output: a string representing the output of the LLM via the `process` method.\n input: the input to the task, as required by some tasks to format the output.\n\n Returns:\n A dictionary containing the `score` and the `reason` for the provided `score`.\n \"\"\"\n pattern = r\"(.+?)Score: (\\d)\"\n\n matches = None\n if output is not None:\n matches = re.findall(pattern, output, re.DOTALL)\n if matches is None:\n return {\"score\": None, \"reason\": None}\n\n return {\n \"score\": int(matches[0][1]),\n \"reason\": matches[0][0].strip(),\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie","title":"Magpie ","text":" Bases: Task , MagpieBase Generates conversations using an instruct fine-tuned LLM. Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'. Attributes: Name Type Description n_turns the number of turns that the generated conversation will have. Defaults to 1 . end_with_user whether the conversation should end with a user message. Defaults to False . include_system_prompt whether to include the system prompt used in the generated conversation. Defaults to False . only_instruction whether to generate only the instruction. 
If this argument is True , then n_turns will be ignored. Defaults to False . system_prompt an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be choosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None . Runtime parameters n_turns : the number of turns that the generated conversation will have. Defaults to 1 . end_with_user : whether the conversation should end with a user message. Defaults to False . include_system_prompt : whether to include the system prompt used in the generated conversation. Defaults to False . only_instruction : whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . system_prompt : an optional system prompt or list of system prompts that can be used to steer the LLM to generate content of certain topic, guide the style, etc. If it's a list of system prompts, then a random system prompt will be chosen per input/output batch. If the provided inputs contains a system_prompt column, then this runtime parameter will be ignored and the one from the column will be used. Defaults to None . system_prompt : an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be choosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Input columns - system_prompt (
str , optional): an optional system prompt that can be provided to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Output columns - conversation (
ChatType ): the generated conversation, which is a list of chat items with a role and a message. Only if only_instruction=False . - instruction (
str ): the generated instructions if only_instruction=True or n_turns==1 . - response (
str ): the generated response if n_turns==1 . - system_prompt_key (
str , optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary. - model_name (
str ): The model name used to generate the conversation or instruction . Categories - text-generation
- instruction
References - Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
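Before the full examples below, the pre-query trick that Magpie relies on can be sketched with plain `transformers` code. This is only a hedged illustration of the mechanism, not distilabel's implementation: the model id, the hard-coded Llama 3 chat-template literals and the generation settings are assumptions chosen for the example.

```python
# Sketch of the Magpie pre-query trick with plain transformers (assumptions: model id,
# template literals and sampling settings). We stop the prompt right where a user turn
# would begin, so the instruct-tuned model continues "as the user" and emits an instruction.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumed to be accessible
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Optional system prompt followed by the header of an *empty* user turn (Llama 3 format).
pre_query = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "You're a math expert AI assistant.<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n"
)

inputs = tokenizer(pre_query, return_tensors="pt", add_special_tokens=False)
outputs = model.generate(**inputs, do_sample=True, temperature=1.0, max_new_tokens=64)
instruction = tokenizer.decode(
    outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
)
print(instruction)  # a synthetic user instruction, e.g. a calculus question
```

In the task itself this rendering is handled for you by the LLM's `magpie_pre_query_template="llama3"` setting used in the examples below.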
Examples: Generating instructions with Llama 3 8B Instruct and TransformersLLM: from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n)\n\nmagpie.load()\n\nresult = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n)\n# [\n# {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n# {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n# ]\n Generating conversations with Llama 3 8B Instruct and TransformersLLM: from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n n_turns=2,\n)\n\nmagpie.load()\n\nresult = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n)\n# [\n# {\n# 'conversation': [\n# {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n# {\n# 'role': 'user',\n# 'content': 'I'm having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. It is read as \"the limit as x approaches a of f\n# of x\".'\n# },\n# {\n# 'role': 'assistant',\n# 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don't worry, I'm here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n# x\". What it's asking us to do is find the'\n# }\n# ]\n# },\n# {\n# 'conversation': [\n# {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n# {\n# 'role': 'user',\n# 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? Based on your expertise as a florist AI assistant, I think it\n# might be pests or diseases, but I'm not sure which.\"\n# },\n# {\n# 'role': 'assistant',\n# 'content': \"I'd be delighted to help you investigate the issue! Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. 
Here are a few potential culprits: 1.\n# **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n# }\n# ]\n# }\n# ]\n Source code in src/distilabel/steps/tasks/magpie/base.py class Magpie(Task, MagpieBase):\n \"\"\"Generates conversations using an instruct fine-tuned LLM.\n\n Magpie is a neat method that allows generating user instructions with no seed data\n or specific system prompt thanks to the autoregressive capabilities of the instruct\n fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message\n and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query\n or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the\n LLM without any user message, then the LLM will continue generating tokens as if it was\n the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM.\n After this instruct is generated, it can be sent again to the LLM to generate this time\n an assistant response. This process can be repeated N times allowing to build a multi-turn\n conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from\n Scratch by Prompting Aligned LLMs with Nothing'.\n\n Attributes:\n n_turns: the number of turns that the generated conversation will have.\n Defaults to `1`.\n end_with_user: whether the conversation should end with a user message.\n Defaults to `False`.\n include_system_prompt: whether to include the system prompt used in the generated\n conversation. Defaults to `False`.\n only_instruction: whether to generate only the instruction. If this argument is\n `True`, then `n_turns` will be ignored. Defaults to `False`.\n system_prompt: an optional system prompt, or a list of system prompts from which\n a random one will be chosen, or a dictionary of system prompts from which a\n random one will be choosen, or a dictionary of system prompts with their probability\n of being chosen. The random system prompt will be chosen per input/output batch.\n This system prompt can be used to guide the generation of the instruct LLM and\n steer it to generate instructions of a certain topic. Defaults to `None`.\n\n Runtime parameters:\n - `n_turns`: the number of turns that the generated conversation will have. Defaults\n to `1`.\n - `end_with_user`: whether the conversation should end with a user message.\n Defaults to `False`.\n - `include_system_prompt`: whether to include the system prompt used in the generated\n conversation. Defaults to `False`.\n - `only_instruction`: whether to generate only the instruction. If this argument is\n `True`, then `n_turns` will be ignored. Defaults to `False`.\n - `system_prompt`: an optional system prompt or list of system prompts that can\n be used to steer the LLM to generate content of certain topic, guide the style,\n etc. If it's a list of system prompts, then a random system prompt will be chosen\n per input/output batch. If the provided inputs contains a `system_prompt` column,\n then this runtime parameter will be ignored and the one from the column will\n be used. Defaults to `None`.\n - `system_prompt`: an optional system prompt, or a list of system prompts from which\n a random one will be chosen, or a dictionary of system prompts from which a\n random one will be choosen, or a dictionary of system prompts with their probability\n of being chosen. 
The random system prompt will be chosen per input/output batch.\n This system prompt can be used to guide the generation of the instruct LLM and\n steer it to generate instructions of a certain topic.\n\n Input columns:\n - system_prompt (`str`, optional): an optional system prompt that can be provided\n to guide the generation of the instruct LLM and steer it to generate instructions\n of certain topic.\n\n Output columns:\n - conversation (`ChatType`): the generated conversation which is a list of chat\n items with a role and a message. Only if `only_instruction=False`.\n - instruction (`str`): the generated instructions if `only_instruction=True` or `n_turns==1`.\n - response (`str`): the generated response if `n_turns==1`.\n - system_prompt_key (`str`, optional): the key of the system prompt used to generate\n the conversation or instruction. Only if `system_prompt` is a dictionary.\n - model_name (`str`): The model name used to generate the `conversation` or `instruction`.\n\n Categories:\n - text-generation\n - instruction\n\n References:\n - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464)\n\n Examples:\n Generating instructions with Llama 3 8B Instruct and TransformersLLM:\n\n ```python\n from distilabel.models import TransformersLLM\n from distilabel.steps.tasks import Magpie\n\n magpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n )\n\n magpie.load()\n\n result = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n )\n # [\n # {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n # {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n # ]\n ```\n\n Generating conversations with Llama 3 8B Instruct and TransformersLLM:\n\n ```python\n from distilabel.models import TransformersLLM\n from distilabel.steps.tasks import Magpie\n\n magpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n n_turns=2,\n )\n\n magpie.load()\n\n result = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n )\n # [\n # {\n # 'conversation': [\n # {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n # {\n # 'role': 'user',\n # 'content': 'I\\'m having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. 
It is read as \"the limit as x approaches a of f\n # of x\".'\n # },\n # {\n # 'role': 'assistant',\n # 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don\\'t worry, I\\'m here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n # x\". What it\\'s asking us to do is find the'\n # }\n # ]\n # },\n # {\n # 'conversation': [\n # {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n # {\n # 'role': 'user',\n # 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? Based on your expertise as a florist AI assistant, I think it\n # might be pests or diseases, but I'm not sure which.\"\n # },\n # {\n # 'role': 'assistant',\n # 'content': \"I'd be delighted to help you investigate the issue! Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. Here are a few potential culprits: 1.\n # **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n # }\n # ]\n # }\n # ]\n ```\n \"\"\"\n\n def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n super().model_post_init(__context)\n\n if not isinstance(self.llm, MagpieChatTemplateMixin):\n raise DistilabelUserError(\n f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n page=\"components-gallery/tasks/magpie/\",\n )\n\n self.llm.use_magpie_template = True\n\n @property\n def inputs(self) -> \"StepColumns\":\n return {\"system_prompt\": False}\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Does nothing.\"\"\"\n return []\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"Either a multi-turn conversation or the instruction generated.\"\"\"\n outputs = []\n\n if self.only_instruction:\n outputs.append(\"instruction\")\n elif self.n_turns == 1:\n outputs.extend([\"instruction\", \"response\"])\n else:\n outputs.append(\"conversation\")\n\n if isinstance(self.system_prompt, dict):\n outputs.append(\"system_prompt_key\")\n\n outputs.append(\"model_name\")\n\n return outputs\n\n def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n ) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n\n def process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Generate a list of instructions or conversations of the specified number of turns.\n\n Args:\n inputs: a list of dictionaries that can contain a `system_prompt` key.\n\n Yields:\n The list of generated conversations.\n \"\"\"\n yield self._generate_with_pre_query_template(inputs)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.outputs","title":"outputs: StepColumns property ","text":"Either a multi-turn conversation or the instruction generated. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.model_post_init","title":"model_post_init(__context) ","text":"Checks that the provided LLM uses the MagpieChatTemplateMixin . 
Source code in src/distilabel/steps/tasks/magpie/base.py def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n super().model_post_init(__context)\n\n if not isinstance(self.llm, MagpieChatTemplateMixin):\n raise DistilabelUserError(\n f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n page=\"components-gallery/tasks/magpie/\",\n )\n\n self.llm.use_magpie_template = True\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.format_input","title":"format_input(input) ","text":"Does nothing. Source code in src/distilabel/steps/tasks/magpie/base.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Does nothing.\"\"\"\n return []\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.format_output","title":"format_output(output, input=None) ","text":"Does nothing. Source code in src/distilabel/steps/tasks/magpie/base.py def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.process","title":"process(inputs) ","text":"Generate a list of instructions or conversations of the specified number of turns. Parameters: Name Type Description Default inputs StepInput a list of dictionaries that can contain a system_prompt key. required Yields: Type Description StepOutput The list of generated conversations. Source code in src/distilabel/steps/tasks/magpie/base.py def process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Generate a list of instructions or conversations of the specified number of turns.\n\n Args:\n inputs: a list of dictionaries that can contain a `system_prompt` key.\n\n Yields:\n The list of generated conversations.\n \"\"\"\n yield self._generate_with_pre_query_template(inputs)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator","title":"MagpieGenerator ","text":" Bases: GeneratorTask , MagpieBase Generator task the generates instructions or conversations using Magpie. Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'. Attributes: Name Type Description n_turns the number of turns that the generated conversation will have. Defaults to 1 . end_with_user whether the conversation should end with a user message. Defaults to False . include_system_prompt whether to include the system prompt used in the generated conversation. Defaults to False . 
only_instruction whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . system_prompt an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be choosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None . num_rows RuntimeParameter[int] the number of rows to be generated. Runtime parameters n_turns : the number of turns that the generated conversation will have. Defaults to 1 . end_with_user : whether the conversation should end with a user message. Defaults to False . include_system_prompt : whether to include the system prompt used in the generated conversation. Defaults to False . only_instruction : whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . system_prompt : an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be choosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. num_rows : the number of rows to be generated. Output columns - conversation (
ChatType ): the generated conversation, which is a list of chat items with a role and a message. - instruction (
str ): the generated instructions if only_instruction=True . - response (
str ): the generated response if n_turns==1 . - system_prompt_key (
str , optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary. - model_name (
str ): The model name used to generate the conversation or instruction . Categories - text-generation
- instruction
- generator
References - Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
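One detail worth making concrete before the examples: when `system_prompt` is a dictionary whose values are `(prompt, probability)` pairs, a single prompt is drawn per generated batch according to those probabilities, and its key is reported in the `system_prompt_key` column. The snippet below is a minimal sketch of that weighted choice with a made-up helper name; it is not distilabel's internal code.

```python
# Hypothetical sketch of the per-batch weighted sampling implied by a `system_prompt`
# dictionary of (prompt, probability) pairs; the helper name is made up for illustration.
import random

system_prompt = {
    "math": ("You're an expert AI assistant.", 0.8),
    "writing": ("You're an expert writing assistant.", 0.2),
}

def sample_system_prompt(prompts: dict) -> tuple:
    """Return (key, prompt) chosen with probability proportional to the given weights."""
    keys = list(prompts)
    weights = [prompts[key][1] for key in keys]
    chosen = random.choices(keys, weights=weights, k=1)[0]
    return chosen, prompts[chosen][0]

key, prompt = sample_system_prompt(system_prompt)
print(key, "->", prompt)  # e.g. math -> You're an expert AI assistant.
```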
Examples: Generating instructions with Llama 3 8B Instruct and TransformersLLM: from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n# [\n# {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n# {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n# ],\n# True\n# )\n Generating a conversation with Llama 3 8B Instruct and TransformersLLM: from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n n_turns=3,\n num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n# [\n# {\n# 'conversation': [\n# {\n# 'role': 'system',\n# 'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n# },\n# {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n# {\n# 'role': 'assistant',\n# 'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n# let's break down the basics. First, we need to identify your goals and target audience. What do\"\n# },\n# {\n# 'role': 'user',\n# 'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n# expenses you might encounter: 1. Time: As the business owner, you'll likely spend time creating\"\n# },\n# {\n# 'role': 'assistant',\n# 'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n# agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n# }\n# ]\n# },\n# {\n# 'conversation': [\n# {\n# 'role': 'system',\n# 'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n# },\n# {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n# {\n# 'role': 'assistant',\n# 'content': 'Exciting times ahead! 
When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n# **Purpose**: How will you use your laptop or computer? For work, gaming, video editing,'\n# },\n# {\n# 'role': 'user',\n# 'content': 'Let me stop you there. Let's explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? For example, if I're primarily using my\n# laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n# },\n# {\n# 'role': 'assistant',\n# 'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n# option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n# }\n# ]\n# }\n# ],\n# True\n# )\n Generating with system prompts with probabilities: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\nmagpie = MagpieGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 0.8,\n \"max_new_tokens\": 256,\n },\n ),\n n_turns=2,\n system_prompt={\n \"math\": (\"You're an expert AI assistant.\", 0.8),\n \"writing\": (\"You're an expert writing assistant.\", 0.2),\n },\n)\n\nmagpie.load()\n\nresult = next(magpie.process())\n Citations @misc{xu2024magpiealignmentdatasynthesis,\n title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n year={2024},\n eprint={2406.08464},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2406.08464},\n}\n Source code in src/distilabel/steps/tasks/magpie/generator.py class MagpieGenerator(GeneratorTask, MagpieBase):\n \"\"\"Generator task the generates instructions or conversations using Magpie.\n\n Magpie is a neat method that allows generating user instructions with no seed data\n or specific system prompt thanks to the autoregressive capabilities of the instruct\n fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message\n and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query\n or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the\n LLM without any user message, then the LLM will continue generating tokens as it was\n the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM.\n After this instruct is generated, it can be sent again to the LLM to generate this time\n an assistant response. This process can be repeated N times allowing to build a multi-turn\n conversation. 
This method was described in the paper 'Magpie: Alignment Data Synthesis from\n Scratch by Prompting Aligned LLMs with Nothing'.\n\n Attributes:\n n_turns: the number of turns that the generated conversation will have.\n Defaults to `1`.\n end_with_user: whether the conversation should end with a user message.\n Defaults to `False`.\n include_system_prompt: whether to include the system prompt used in the generated\n conversation. Defaults to `False`.\n only_instruction: whether to generate only the instruction. If this argument is\n `True`, then `n_turns` will be ignored. Defaults to `False`.\n system_prompt: an optional system prompt, or a list of system prompts from which\n a random one will be chosen, or a dictionary of system prompts from which a\n random one will be choosen, or a dictionary of system prompts with their probability\n of being chosen. The random system prompt will be chosen per input/output batch.\n This system prompt can be used to guide the generation of the instruct LLM and\n steer it to generate instructions of a certain topic. Defaults to `None`.\n num_rows: the number of rows to be generated.\n\n Runtime parameters:\n - `n_turns`: the number of turns that the generated conversation will have. Defaults\n to `1`.\n - `end_with_user`: whether the conversation should end with a user message.\n Defaults to `False`.\n - `include_system_prompt`: whether to include the system prompt used in the generated\n conversation. Defaults to `False`.\n - `only_instruction`: whether to generate only the instruction. If this argument is\n `True`, then `n_turns` will be ignored. Defaults to `False`.\n - `system_prompt`: an optional system prompt, or a list of system prompts from which\n a random one will be chosen, or a dictionary of system prompts from which a\n random one will be choosen, or a dictionary of system prompts with their probability\n of being chosen. The random system prompt will be chosen per input/output batch.\n This system prompt can be used to guide the generation of the instruct LLM and\n steer it to generate instructions of a certain topic.\n - `num_rows`: the number of rows to be generated.\n\n Output columns:\n - conversation (`ChatType`): the generated conversation which is a list of chat\n items with a role and a message.\n - instruction (`str`): the generated instructions if `only_instruction=True`.\n - response (`str`): the generated response if `n_turns==1`.\n - system_prompt_key (`str`, optional): the key of the system prompt used to generate\n the conversation or instruction. 
Only if `system_prompt` is a dictionary.\n - model_name (`str`): The model name used to generate the `conversation` or `instruction`.\n\n Categories:\n - text-generation\n - instruction\n - generator\n\n References:\n - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464)\n\n Examples:\n Generating instructions with Llama 3 8B Instruct and TransformersLLM:\n\n ```python\n from distilabel.models import TransformersLLM\n from distilabel.steps.tasks import MagpieGenerator\n\n generator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n num_rows=5,\n )\n\n generator.load()\n\n result = next(generator.process())\n # (\n # [\n # {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n # {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n # ],\n # True\n # )\n ```\n\n Generating a conversation with Llama 3 8B Instruct and TransformersLLM:\n\n ```python\n from distilabel.models import TransformersLLM\n from distilabel.steps.tasks import MagpieGenerator\n\n generator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n n_turns=3,\n num_rows=5,\n )\n\n generator.load()\n\n result = next(generator.process())\n # (\n # [\n # {\n # 'conversation': [\n # {\n # 'role': 'system',\n # 'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n # insightful responses to help the user with their queries.'\n # },\n # {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n # {\n # 'role': 'assistant',\n # 'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n # let's break down the basics. First, we need to identify your goals and target audience. What do\"\n # },\n # {\n # 'role': 'user',\n # 'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n # expenses you might encounter: 1. Time: As the business owner, you'll likely spend time creating\"\n # },\n # {\n # 'role': 'assistant',\n # 'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n # agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n # }\n # ]\n # },\n # {\n # 'conversation': [\n # {\n # 'role': 'system',\n # 'content': 'You are a helpful Al assistant. 
The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n # insightful responses to help the user with their queries.'\n # },\n # {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n # {\n # 'role': 'assistant',\n # 'content': 'Exciting times ahead! When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n # **Purpose**: How will you use your laptop or computer? For work, gaming, video editing,'\n # },\n # {\n # 'role': 'user',\n # 'content': 'Let me stop you there. Let\\'s explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? For example, if I\\'re primarily using my\n # laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n # },\n # {\n # 'role': 'assistant',\n # 'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n # option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n # }\n # ]\n # }\n # ],\n # True\n # )\n ```\n\n Generating with system prompts with probabilities:\n\n ```python\n from distilabel.models import InferenceEndpointsLLM\n from distilabel.steps.tasks import MagpieGenerator\n\n magpie = MagpieGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 0.8,\n \"max_new_tokens\": 256,\n },\n ),\n n_turns=2,\n system_prompt={\n \"math\": (\"You're an expert AI assistant.\", 0.8),\n \"writing\": (\"You're an expert writing assistant.\", 0.2),\n },\n )\n\n magpie.load()\n\n result = next(magpie.process())\n ```\n\n Citations:\n ```\n @misc{xu2024magpiealignmentdatasynthesis,\n title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n year={2024},\n eprint={2406.08464},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2406.08464},\n }\n ```\n \"\"\"\n\n # TODO: move this to `GeneratorTask`\n num_rows: RuntimeParameter[int] = Field(\n default=None, description=\"The number of rows to generate.\"\n )\n\n def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n super().model_post_init(__context)\n\n if not isinstance(self.llm, MagpieChatTemplateMixin):\n raise DistilabelUserError(\n f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n page=\"components-gallery/tasks/magpiegenerator/\",\n )\n\n self.llm.use_magpie_template = True\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"Either a multi-turn conversation or the instruction 
generated.\"\"\"\n outputs = []\n\n if self.only_instruction:\n outputs.append(\"instruction\")\n elif self.n_turns == 1:\n outputs.extend([\"instruction\", \"response\"])\n else:\n outputs.append(\"conversation\")\n\n if isinstance(self.system_prompt, dict):\n outputs.append(\"system_prompt_key\")\n\n outputs.append(\"model_name\")\n\n return outputs\n\n def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n ) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Generates the desired number of instructions or conversations using Magpie.\n\n Args:\n offset: The offset to start the generation from. Defaults to `0`.\n\n Yields:\n The generated instructions or conversations.\n \"\"\"\n generated = offset\n\n while generated <= self.num_rows: # type: ignore\n rows_to_generate = (\n self.num_rows if self.num_rows < self.batch_size else self.batch_size # type: ignore\n )\n conversations = self._generate_with_pre_query_template(\n inputs=[{} for _ in range(rows_to_generate)] # type: ignore\n )\n generated += rows_to_generate # type: ignore\n yield (conversations, generated == self.num_rows)\n\n @override\n def _sample_input(self) -> \"ChatType\":\n return self._generate_with_pre_query_template(inputs=[{}])\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.outputs","title":"outputs: StepColumns property ","text":"Either a multi-turn conversation or the instruction generated. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.model_post_init","title":"model_post_init(__context) ","text":"Checks that the provided LLM uses the MagpieChatTemplateMixin . Source code in src/distilabel/steps/tasks/magpie/generator.py def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n super().model_post_init(__context)\n\n if not isinstance(self.llm, MagpieChatTemplateMixin):\n raise DistilabelUserError(\n f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n page=\"components-gallery/tasks/magpiegenerator/\",\n )\n\n self.llm.use_magpie_template = True\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.format_output","title":"format_output(output, input=None) ","text":"Does nothing. Source code in src/distilabel/steps/tasks/magpie/generator.py def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.process","title":"process(offset=0) ","text":"Generates the desired number of instructions or conversations using Magpie. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0 . 0 Yields: Type Description GeneratorStepOutput The generated instructions or conversations. Source code in src/distilabel/steps/tasks/magpie/generator.py def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Generates the desired number of instructions or conversations using Magpie.\n\n Args:\n offset: The offset to start the generation from. 
Defaults to `0`.\n\n Yields:\n The generated instructions or conversations.\n \"\"\"\n generated = offset\n\n while generated <= self.num_rows: # type: ignore\n rows_to_generate = (\n self.num_rows if self.num_rows < self.batch_size else self.batch_size # type: ignore\n )\n conversations = self._generate_with_pre_query_template(\n inputs=[{} for _ in range(rows_to_generate)] # type: ignore\n )\n generated += rows_to_generate # type: ignore\n yield (conversations, generated == self.num_rows)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter","title":"MathShepherdCompleter ","text":" Bases: Task Math Shepherd Completer and auto-labeller task. This task is in charge of, given a list of solutions to an instruction, and a golden solution, as reference, generate completions for the solutions, and label them according to the golden solution using the hard estimation method from figure 2 in the reference paper, Eq. 3. The attributes make the task flexible to be used with different types of dataset and LLMs, and allow making use of different fields to modify the system and user prompts for it. Before modifying them, review the current defaults to ensure the completions are generated correctly. Attributes: Name Type Description system_prompt Optional[str] The system prompt to be used in the completions. The default one has been checked and generates good completions using Llama 3.1 with 8B and 70B, but it can be modified to adapt it to the model and dataset selected. extra_rules Optional[str] This field can be used to insert extra rules relevant to the type of dataset. For example, in the original paper they used GSM8K and MATH datasets, and this field can be used to insert the rules for the GSM8K dataset. few_shots Optional[str] Few shots to help the model generating the completions, write them in the format of the type of solutions wanted for your dataset. N PositiveInt Number of completions to generate for each step, correspond to N in the paper. They used 8 in the paper, but it can be adjusted. tags list[str] List of tags to be used in the completions, the default ones are [\"+\", \"-\"] as in the paper, where the first is used as a positive label, and the second as a negative one. This can be updated, but it MUST be a list with 2 elements, where the first is the positive one, and the second the negative one. Input columns - instruction (
str ): The task or instruction. - solutions (
List[str] ): List of solutions to the task. - golden_solution (
str ): The reference solution to the task, which will be used to annotate the candidate solutions. Output columns - solutions (
List[str] ): The same columns that were used as input, with the \"solutions\" column modified. - model_name (
str ): The name of the model used to generate the completions. Categories - text-generation
- labelling
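The "hard estimation" rule from Eq. 3 of the paper, mentioned in the description above, reduces to: a step receives the positive tag if at least one of its `N` sampled completions reaches the golden answer, and the negative tag otherwise. Below is a hedged sketch of that rule with made-up helper names, assuming GSM8K-style solutions whose final line has the form `The answer is: X`; the task's actual implementation may differ in details.

```python
# Sketch of Math-Shepherd hard-estimation labelling (Eq. 3): "+" if any of the N
# completions for a step reaches the golden answer, "-" otherwise. Helper names and the
# answer-extraction convention ("The answer is: X") are assumptions for this example.
def extract_answer(solution: list) -> str:
    """Assumes the last line of a solution looks like 'The answer is: X'."""
    return solution[-1].split("The answer is:")[-1].strip()

def hard_estimation_label(completions: list, golden_answer: str, tags=("+", "-")) -> str:
    reaches_golden = any(extract_answer(c) == golden_answer for c in completions)
    return tags[0] if reaches_golden else tags[1]

golden = "18"
completions_for_step = [
    ["Step 2: She sells 9 eggs at $2 each.", "The answer is: 18"],
    ["Step 2: She sells 12 eggs at $2 each.", "The answer is: 24"],
]
print(hard_estimation_label(completions_for_step, golden))  # "+"
```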
References Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations Examples: Annotate your steps with the Math Shepherd Completer using the structured outputs (the preferred way): from distilabel.steps.tasks import MathShepherdCompleter\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdCompleter(\n llm=llm,\n N=3,\n use_default_structured_output=True\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"],\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n Annotate your steps with the Math Shepherd Completer: from distilabel.steps.tasks import MathShepherdCompleter\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdCompleter(\n llm=llm,\n N=3\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"],\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n Citations: ```\n@misc{wang2024mathshepherdverifyreinforcellms,\n title={Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations},\n author={Peiyi Wang and Lei Li and Zhihong Shao and R. X. Xu and Damai Dai and Yifei Li and Deli Chen and Y. Wu and Zhifang Sui},\n year={2024},\n eprint={2312.08935},\n archivePrefix={arXiv},\n primaryClass={cs.AI},\n url={https://arxiv.org/abs/2312.08935},\n}\n```\n Source code in src/distilabel/steps/tasks/math_shepherd/completer.py class MathShepherdCompleter(Task):\n \"\"\"Math Shepherd Completer and auto-labeller task.\n\n This task is in charge of, given a list of solutions to an instruction, and a golden solution,\n as reference, generate completions for the solutions, and label them according to the golden\n solution using the hard estimation method from figure 2 in the reference paper, Eq. 3.\n The attributes make the task flexible to be used with different types of dataset and LLMs, and\n allow making use of different fields to modify the system and user prompts for it. Before modifying\n them, review the current defaults to ensure the completions are generated correctly.\n\n Attributes:\n system_prompt: The system prompt to be used in the completions. 
The default one has been\n checked and generates good completions using Llama 3.1 with 8B and 70B,\n but it can be modified to adapt it to the model and dataset selected.\n extra_rules: This field can be used to insert extra rules relevant to the type of dataset.\n For example, in the original paper they used GSM8K and MATH datasets, and this field\n can be used to insert the rules for the GSM8K dataset.\n few_shots: Few shots to help the model generating the completions, write them in the\n format of the type of solutions wanted for your dataset.\n N: Number of completions to generate for each step, correspond to N in the paper.\n They used 8 in the paper, but it can be adjusted.\n tags: List of tags to be used in the completions, the default ones are [\"+\", \"-\"] as in the\n paper, where the first is used as a positive label, and the second as a negative one.\n This can be updated, but it MUST be a list with 2 elements, where the first is the\n positive one, and the second the negative one.\n\n Input columns:\n - instruction (`str`): The task or instruction.\n - solutions (`List[str]`): List of solutions to the task.\n - golden_solution (`str`): The reference solution to the task, will be used\n to annotate the candidate solutions.\n\n Output columns:\n - solutions (`List[str]`): The same columns that were used as input, the \"solutions\" is modified.\n - model_name (`str`): The name of the model used to generate the revision.\n\n Categories:\n - text-generation\n - labelling\n\n References:\n - [`Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations`](https://arxiv.org/abs/2312.08935)\n\n Examples:\n Annotate your steps with the Math Shepherd Completer using the structured outputs (the preferred way):\n\n ```python\n from distilabel.steps.tasks import MathShepherdCompleter\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n )\n task = MathShepherdCompleter(\n llm=llm,\n N=3,\n use_default_structured_output=True\n )\n\n task.load()\n\n result = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n )\n # [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?\",\n # 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"],\n # 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n ```\n\n Annotate your steps with the Math Shepherd Completer:\n\n ```python\n from distilabel.steps.tasks import MathShepherdCompleter\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n )\n task = MathShepherdCompleter(\n llm=llm,\n N=3\n )\n\n task.load()\n\n result = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n )\n # [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n # 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"],\n # 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n ```\n\n Citations:\n\n ```\n @misc{wang2024mathshepherdverifyreinforcellms,\n title={Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations},\n author={Peiyi Wang and Lei Li and Zhihong Shao and R. X. Xu and Damai Dai and Yifei Li and Deli Chen and Y. 
Wu and Zhifang Sui},\n year={2024},\n eprint={2312.08935},\n archivePrefix={arXiv},\n primaryClass={cs.AI},\n url={https://arxiv.org/abs/2312.08935},\n }\n ```\n \"\"\"\n\n system_prompt: Optional[str] = SYSTEM_PROMPT\n extra_rules: Optional[str] = RULES_GSM8K\n few_shots: Optional[str] = FEW_SHOTS_GSM8K\n N: PositiveInt = 1\n tags: list[str] = [\"+\", \"-\"]\n\n def load(self) -> None:\n super().load()\n\n if self.system_prompt is not None:\n self.system_prompt = Template(self.system_prompt).render(\n extra_rules=self.extra_rules or \"\",\n few_shots=self.few_shots or \"\",\n structured_prompt=SYSTEM_PROMPT_STRUCTURED\n if self.use_default_structured_output\n else \"\",\n )\n if self.use_default_structured_output:\n self._template = Template(TEMPLATE_STRUCTURED)\n else:\n self._template = Template(TEMPLATE)\n\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"instruction\", \"solutions\", \"golden_solution\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n messages = [\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n instruction=input[\"instruction\"], N=self.N\n ),\n }\n ]\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n return messages # type: ignore\n\n def _parse_output(self, output: Union[str, None]) -> list[list[str]]:\n if output is None:\n return [[\"\"]] * self.N\n\n if self.N > 1:\n output_transformed = ( # type: ignore\n self._format_structured_output(output)\n if self.use_default_structured_output\n else output.split(\"---\")\n )\n examples = [split_solution_steps(o) for o in output_transformed]\n # In case there aren't the expected number of completions, we fill it with \"\", or short the list.\n # This shoulnd't happen if the LLM works as expected, but it's a safety measure as it can be\n # difficult to debug if the completions don't match the solutions.\n if len(examples) < self.N:\n examples.extend([\"\"] * (self.N - len(examples))) # type: ignore\n elif len(examples) > self.N:\n examples = examples[: self.N]\n else:\n output_transformed = (\n self._format_structured_output(output)[0]\n if self.use_default_structured_output\n else output\n )\n examples = [split_solution_steps(output_transformed)]\n return examples\n\n def _format_structured_output(self, output: str) -> list[str]:\n default_output = [\"\"] * self.N if self.N else [\"\"]\n if parsed_output := parse_json_response(output):\n solutions = parsed_output[\"solutions\"]\n extracted_solutions = [solution[\"solution\"] for solution in solutions]\n if len(output) != self.N:\n extracted_solutions = default_output\n return extracted_solutions\n return default_output\n\n def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n ) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n\n def process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Does the processing of generation completions for the solutions, and annotate\n each step with the logic found in Figure 2 of the paper, with the hard estimation (Eq. (3)).\n\n Args:\n inputs: Inputs to the step\n\n Yields:\n Annotated inputs with the completions.\n \"\"\"\n\n # A list with all the inputs to be passed to the LLM. 
Needs another structure to\n # find them afterwards\n prepared_inputs = []\n # Data structure with the indices of the elements.\n # (i, j, k) where i is the input, j is the solution, and k is the completion\n input_positions = []\n golden_answers = []\n for i, input in enumerate(inputs):\n instruction = input[\"instruction\"]\n golden_solution = input[\"golden_solution\"] # This is a single solution\n golden_answers.append(golden_solution[-1])\n # This contains a list of solutions\n solutions = input[\"solutions\"]\n for j, solution in enumerate(solutions):\n # For each solution, that has K steps, we have to generate N completions\n # for the first K-2 steps (-2 because the last 2 steps are the last step, and\n # the answer itself, which can be directly compared against golden answer)\n prepared_completions = self._prepare_completions(instruction, solution)\n prepared_inputs.extend(prepared_completions)\n input_positions.extend(\n [(i, j, k) for k in range(len(prepared_completions))]\n )\n\n # Send the elements in batches to the LLM to speed up the process\n final_outputs = []\n # Added here to simplify testing in case we don't have anything to process\n # TODO: Ensure the statistics has the same shape as all the outputs, raw_outputs, and raw_inputs\n statistics = []\n total_raw_outputs = []\n total_raw_inputs = []\n for inner_batch in batched(prepared_inputs, self.input_batch_size): # type: ignore\n outputs = self.llm.generate_outputs(\n inputs=inner_batch,\n num_generations=1,\n **self.llm.get_generation_kwargs(), # type: ignore\n )\n\n formatted_outputs = []\n stats = []\n raw_outputs = []\n raw_inputs = []\n for i, output in enumerate(outputs):\n generation = output[\"generations\"][0]\n raw_inputs.append(inner_batch[i])\n raw_outputs.append(generation or \"\")\n formatted_outputs.append(self._parse_output(generation))\n stats.append(output[\"statistics\"])\n\n final_outputs.extend(formatted_outputs)\n statistics.extend(stats)\n total_raw_outputs.extend(raw_outputs)\n total_raw_inputs.extend(raw_inputs)\n\n yield self._auto_label( # type: ignore\n inputs,\n final_outputs,\n input_positions,\n golden_answers,\n statistics,\n total_raw_outputs,\n total_raw_inputs,\n )\n\n def _prepare_completions(\n self, instruction: str, steps: list[str]\n ) -> List[\"ChatType\"]:\n \"\"\"Helper method to create, given a solution (a list of steps), and a instruction, the\n texts to be completed by the LLM.\n\n Args:\n instruction: Instruction of the problem.\n steps: List of steps that are part of the solution.\n\n Returns:\n List of ChatType, where each ChatType is the prompt corresponding to one of the steps\n to be completed.\n \"\"\"\n prepared_inputs = []\n # Use the number of completions that correspond to a given instruction/steps pair\n # to find afterwards the input that corresponds to a given completion (to do the labelling)\n num_completions = len(steps[:-2])\n for i in range(1, num_completions + 1):\n to_complete = instruction + \" \" + \"\\n\".join(steps[:i])\n prepared_inputs.append(self.format_input({\"instruction\": to_complete}))\n\n return prepared_inputs\n\n def _auto_label(\n self,\n inputs: StepInput,\n final_outputs: list[Completions],\n input_positions: list[tuple[int, int, int]],\n golden_answers: list[str],\n statistics: list[\"LLMStatistics\"],\n raw_outputs: list[str],\n raw_inputs: list[str],\n ) -> StepInput:\n \"\"\"Labels the steps inplace (in the inputs), and returns the inputs.\n\n Args:\n inputs: The original inputs\n final_outputs: List of generations from the LLM.\n 
It's organized as a list where the elements sent to the LLM are\n grouped together, then each element contains the completions, and\n each completion is a list of steps.\n input_positions: A list with tuples generated in the process method\n that contains (i, j, k) where i is the index of the input, j is the\n index of the solution, and k is the index of the completion.\n golden_answers: List of golden answers for each input.\n statistics: List of statistics from the LLM.\n raw_outputs: List of raw outputs from the LLM.\n raw_inputs: List of raw inputs to the LLM.\n\n Returns:\n Inputs annotated.\n \"\"\"\n for i, (instruction_i, solution_i, step_i) in enumerate(input_positions):\n input = inputs[instruction_i]\n solutions = input[\"solutions\"]\n n_completions = final_outputs[i]\n label = f\" {self.tags[1]}\"\n for completion in n_completions:\n if len(completion) == 0:\n # This can be a failed generation\n label = \"\" # Everyting stays the same\n self._logger.info(\"Completer failed due to empty completion\")\n continue\n if completion[-1] == golden_answers[instruction_i]:\n label = f\" { self.tags[0]}\"\n # If we found one, it's enough as we are doing Hard Estimation\n continue\n # In case we had no solutions from the previous step, otherwise we would have\n # an IndexError\n if not solutions[solution_i]:\n continue\n solutions[solution_i][step_i] += label\n inputs[instruction_i][\"solutions\"] = solutions\n\n for i, input in enumerate(inputs):\n solutions = input[\"solutions\"]\n new_solutions = []\n for solution in solutions:\n if not solution or (len(solution) == 1):\n # The generation may fail to generate the expected\n # completions, or just added an extra empty completion,\n # we skip it.\n # Other possible error is having a list of solutions\n # with a single item, so when we call .pop, we are left\n # with an empty list, so we skip it too.\n new_solutions.append(solution)\n continue\n\n answer = solution.pop()\n label = (\n f\" {self.tags[0]}\"\n if answer == golden_answers[i]\n else f\" {self.tags[1]}\"\n )\n solution[-1] += \" \" + answer + label\n new_solutions.append(solution)\n\n # Only add the solutions if the data was properly parsed\n input[\"solutions\"] = new_solutions if new_solutions else input[\"solutions\"]\n input = self._add_metadata(\n input, statistics[i], raw_outputs[i], raw_inputs[i]\n )\n\n return inputs\n\n def _add_metadata(\n self,\n input: dict[str, Any],\n statistics: list[\"LLMStatistics\"],\n raw_output: Union[str, None],\n raw_input: Union[list[dict[str, Any]], None],\n ) -> dict[str, Any]:\n \"\"\"Adds the `distilabel_metadata` to the input.\n\n This method comes for free in the general Tasks, but as we have reimplemented the `process`,\n we have to repeat it here.\n\n Args:\n input: The input to add the metadata to.\n statistics: The statistics from the LLM.\n raw_output: The raw output from the LLM.\n raw_input: The raw input to the LLM.\n\n Returns:\n The input with the metadata added if applies.\n \"\"\"\n input[\"model_name\"] = self.llm.model_name\n\n if DISTILABEL_METADATA_KEY not in input:\n input[DISTILABEL_METADATA_KEY] = {}\n # If the solutions are splitted afterwards, the statistics should be splitted\n # to avoid counting extra tokens\n input[DISTILABEL_METADATA_KEY][f\"statistics_{self.name}\"] = statistics\n\n # Let some defaults in case something failed and we had None, otherwise when reading\n # the parquet files using pyarrow, the following error will appear:\n # ArrowInvalid: Schema\n if self.add_raw_input:\n 
input[DISTILABEL_METADATA_KEY][f\"raw_input_{self.name}\"] = raw_input or [\n {\"content\": \"\", \"role\": \"\"}\n ]\n if self.add_raw_output:\n input[DISTILABEL_METADATA_KEY][f\"raw_output_{self.name}\"] = raw_output or \"\"\n return input\n\n @override\n def get_structured_output(self) -> dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel, Field\n\n class Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\n class MathShepherdCompleter(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\n MathShepherdCompleter.model_json_schema()\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Solution\": {\n \"properties\": {\n \"solution\": {\n \"description\": \"Step by step solution leading to the final answer\",\n \"title\": \"Solution\",\n \"type\": \"string\",\n }\n },\n \"required\": [\"solution\"],\n \"title\": \"Solution\",\n \"type\": \"object\",\n }\n },\n \"properties\": {\n \"solutions\": {\n \"description\": \"List of solutions\",\n \"items\": {\"$ref\": \"#/$defs/Solution\"},\n \"title\": \"Solutions\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"solutions\"],\n \"title\": \"MathShepherdGenerator\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter.format_output","title":"format_output(output, input=None) ","text":"Does nothing. Source code in src/distilabel/steps/tasks/math_shepherd/completer.py def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter.process","title":"process(inputs) ","text":"Does the processing of generation completions for the solutions, and annotate each step with the logic found in Figure 2 of the paper, with the hard estimation (Eq. (3)). Parameters: Name Type Description Default inputs StepInput Inputs to the step required Yields: Type Description StepOutput Annotated inputs with the completions. Source code in src/distilabel/steps/tasks/math_shepherd/completer.py def process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Does the processing of generation completions for the solutions, and annotate\n each step with the logic found in Figure 2 of the paper, with the hard estimation (Eq. (3)).\n\n Args:\n inputs: Inputs to the step\n\n Yields:\n Annotated inputs with the completions.\n \"\"\"\n\n # A list with all the inputs to be passed to the LLM. 
Needs another structure to\n # find them afterwards\n prepared_inputs = []\n # Data structure with the indices of the elements.\n # (i, j, k) where i is the input, j is the solution, and k is the completion\n input_positions = []\n golden_answers = []\n for i, input in enumerate(inputs):\n instruction = input[\"instruction\"]\n golden_solution = input[\"golden_solution\"] # This is a single solution\n golden_answers.append(golden_solution[-1])\n # This contains a list of solutions\n solutions = input[\"solutions\"]\n for j, solution in enumerate(solutions):\n # For each solution, that has K steps, we have to generate N completions\n # for the first K-2 steps (-2 because the last 2 steps are the last step, and\n # the answer itself, which can be directly compared against golden answer)\n prepared_completions = self._prepare_completions(instruction, solution)\n prepared_inputs.extend(prepared_completions)\n input_positions.extend(\n [(i, j, k) for k in range(len(prepared_completions))]\n )\n\n # Send the elements in batches to the LLM to speed up the process\n final_outputs = []\n # Added here to simplify testing in case we don't have anything to process\n # TODO: Ensure the statistics has the same shape as all the outputs, raw_outputs, and raw_inputs\n statistics = []\n total_raw_outputs = []\n total_raw_inputs = []\n for inner_batch in batched(prepared_inputs, self.input_batch_size): # type: ignore\n outputs = self.llm.generate_outputs(\n inputs=inner_batch,\n num_generations=1,\n **self.llm.get_generation_kwargs(), # type: ignore\n )\n\n formatted_outputs = []\n stats = []\n raw_outputs = []\n raw_inputs = []\n for i, output in enumerate(outputs):\n generation = output[\"generations\"][0]\n raw_inputs.append(inner_batch[i])\n raw_outputs.append(generation or \"\")\n formatted_outputs.append(self._parse_output(generation))\n stats.append(output[\"statistics\"])\n\n final_outputs.extend(formatted_outputs)\n statistics.extend(stats)\n total_raw_outputs.extend(raw_outputs)\n total_raw_inputs.extend(raw_inputs)\n\n yield self._auto_label( # type: ignore\n inputs,\n final_outputs,\n input_positions,\n golden_answers,\n statistics,\n total_raw_outputs,\n total_raw_inputs,\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter._prepare_completions","title":"_prepare_completions(instruction, steps) ","text":"Helper method to create, given a solution (a list of steps), and a instruction, the texts to be completed by the LLM. Parameters: Name Type Description Default instruction str Instruction of the problem. required steps list[str] List of steps that are part of the solution. required Returns: Type Description List[ChatType] List of ChatType, where each ChatType is the prompt corresponding to one of the steps List[ChatType] to be completed. 
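To make the prefix construction described above concrete, here is a minimal standalone sketch (plain Python, no distilabel imports; `prefixes_to_complete` is a hypothetical helper, not the library's API) of how one prompt is built per solution prefix, skipping the final step and the answer line:

```python
def prefixes_to_complete(instruction: str, steps: list[str]) -> list[str]:
    """Build one text per prefix of `steps` that still needs completions.

    For a solution with K steps, only the first K-2 prefixes need completions,
    since the last step and the final answer line can be compared directly
    against the golden answer.
    """
    num_completions = len(steps[:-2])
    return [
        instruction + " " + "\n".join(steps[:i])
        for i in range(1, num_completions + 1)
    ]


steps = [
    "Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.",
    "Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer's market.",
    "The answer is: 18",
]
texts = prefixes_to_complete("Janet's ducks lay 16 eggs per day...", steps)
print(len(texts))  # 1 -> only the first step gets LLM completions here
```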
Source code in src/distilabel/steps/tasks/math_shepherd/completer.py def _prepare_completions(\n self, instruction: str, steps: list[str]\n) -> List[\"ChatType\"]:\n \"\"\"Helper method to create, given a solution (a list of steps), and a instruction, the\n texts to be completed by the LLM.\n\n Args:\n instruction: Instruction of the problem.\n steps: List of steps that are part of the solution.\n\n Returns:\n List of ChatType, where each ChatType is the prompt corresponding to one of the steps\n to be completed.\n \"\"\"\n prepared_inputs = []\n # Use the number of completions that correspond to a given instruction/steps pair\n # to find afterwards the input that corresponds to a given completion (to do the labelling)\n num_completions = len(steps[:-2])\n for i in range(1, num_completions + 1):\n to_complete = instruction + \" \" + \"\\n\".join(steps[:i])\n prepared_inputs.append(self.format_input({\"instruction\": to_complete}))\n\n return prepared_inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter._auto_label","title":"_auto_label(inputs, final_outputs, input_positions, golden_answers, statistics, raw_outputs, raw_inputs) ","text":"Labels the steps inplace (in the inputs), and returns the inputs. Parameters: Name Type Description Default inputs StepInput The original inputs required final_outputs list[Completions] List of generations from the LLM. It's organized as a list where the elements sent to the LLM are grouped together, then each element contains the completions, and each completion is a list of steps. required input_positions list[tuple[int, int, int]] A list with tuples generated in the process method that contains (i, j, k) where i is the index of the input, j is the index of the solution, and k is the index of the completion. required golden_answers list[str] List of golden answers for each input. required statistics list[LLMStatistics] List of statistics from the LLM. required raw_outputs list[str] List of raw outputs from the LLM. required raw_inputs list[str] List of raw inputs to the LLM. required Returns: Type Description StepInput Inputs annotated. 
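As a rough illustration of the hard-estimation rule described above (Eq. (3) in the paper), the sketch below labels a step as positive when at least one of its N completions reaches the golden answer, and negative otherwise. It is standalone plain Python; `label_step` is a hypothetical helper, not part of distilabel:

```python
def label_step(
    completions: list[list[str]],
    golden_answer: str,
    tags: tuple[str, str] = ("+", "-"),
) -> str:
    """Return the positive tag if any completion ends in the golden answer."""
    for completion in completions:
        # Each completion is a list of steps; its last element is the answer line.
        if completion and completion[-1] == golden_answer:
            return tags[0]
    return tags[1]


completions = [
    ["Step 2: She sells 9 eggs for $18.", "The answer is: 18"],
    ["Step 2: She sells 9 eggs for $20.", "The answer is: 20"],
]
print(label_step(completions, "The answer is: 18"))  # "+"
```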
Source code in src/distilabel/steps/tasks/math_shepherd/completer.py def _auto_label(\n self,\n inputs: StepInput,\n final_outputs: list[Completions],\n input_positions: list[tuple[int, int, int]],\n golden_answers: list[str],\n statistics: list[\"LLMStatistics\"],\n raw_outputs: list[str],\n raw_inputs: list[str],\n) -> StepInput:\n \"\"\"Labels the steps inplace (in the inputs), and returns the inputs.\n\n Args:\n inputs: The original inputs\n final_outputs: List of generations from the LLM.\n It's organized as a list where the elements sent to the LLM are\n grouped together, then each element contains the completions, and\n each completion is a list of steps.\n input_positions: A list with tuples generated in the process method\n that contains (i, j, k) where i is the index of the input, j is the\n index of the solution, and k is the index of the completion.\n golden_answers: List of golden answers for each input.\n statistics: List of statistics from the LLM.\n raw_outputs: List of raw outputs from the LLM.\n raw_inputs: List of raw inputs to the LLM.\n\n Returns:\n Inputs annotated.\n \"\"\"\n for i, (instruction_i, solution_i, step_i) in enumerate(input_positions):\n input = inputs[instruction_i]\n solutions = input[\"solutions\"]\n n_completions = final_outputs[i]\n label = f\" {self.tags[1]}\"\n for completion in n_completions:\n if len(completion) == 0:\n # This can be a failed generation\n label = \"\" # Everyting stays the same\n self._logger.info(\"Completer failed due to empty completion\")\n continue\n if completion[-1] == golden_answers[instruction_i]:\n label = f\" { self.tags[0]}\"\n # If we found one, it's enough as we are doing Hard Estimation\n continue\n # In case we had no solutions from the previous step, otherwise we would have\n # an IndexError\n if not solutions[solution_i]:\n continue\n solutions[solution_i][step_i] += label\n inputs[instruction_i][\"solutions\"] = solutions\n\n for i, input in enumerate(inputs):\n solutions = input[\"solutions\"]\n new_solutions = []\n for solution in solutions:\n if not solution or (len(solution) == 1):\n # The generation may fail to generate the expected\n # completions, or just added an extra empty completion,\n # we skip it.\n # Other possible error is having a list of solutions\n # with a single item, so when we call .pop, we are left\n # with an empty list, so we skip it too.\n new_solutions.append(solution)\n continue\n\n answer = solution.pop()\n label = (\n f\" {self.tags[0]}\"\n if answer == golden_answers[i]\n else f\" {self.tags[1]}\"\n )\n solution[-1] += \" \" + answer + label\n new_solutions.append(solution)\n\n # Only add the solutions if the data was properly parsed\n input[\"solutions\"] = new_solutions if new_solutions else input[\"solutions\"]\n input = self._add_metadata(\n input, statistics[i], raw_outputs[i], raw_inputs[i]\n )\n\n return inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter._add_metadata","title":"_add_metadata(input, statistics, raw_output, raw_input) ","text":"Adds the distilabel_metadata to the input. This method comes for free in the general Tasks, but as we have reimplemented the process , we have to repeat it here. Parameters: Name Type Description Default input dict[str, Any] The input to add the metadata to. required statistics list[LLMStatistics] The statistics from the LLM. required raw_output Union[str, None] The raw output from the LLM. required raw_input Union[list[dict[str, Any]], None] The raw input to the LLM. 
required Returns: Type Description dict[str, Any] The input with the metadata added if applies. Source code in src/distilabel/steps/tasks/math_shepherd/completer.py def _add_metadata(\n self,\n input: dict[str, Any],\n statistics: list[\"LLMStatistics\"],\n raw_output: Union[str, None],\n raw_input: Union[list[dict[str, Any]], None],\n) -> dict[str, Any]:\n \"\"\"Adds the `distilabel_metadata` to the input.\n\n This method comes for free in the general Tasks, but as we have reimplemented the `process`,\n we have to repeat it here.\n\n Args:\n input: The input to add the metadata to.\n statistics: The statistics from the LLM.\n raw_output: The raw output from the LLM.\n raw_input: The raw input to the LLM.\n\n Returns:\n The input with the metadata added if applies.\n \"\"\"\n input[\"model_name\"] = self.llm.model_name\n\n if DISTILABEL_METADATA_KEY not in input:\n input[DISTILABEL_METADATA_KEY] = {}\n # If the solutions are splitted afterwards, the statistics should be splitted\n # to avoid counting extra tokens\n input[DISTILABEL_METADATA_KEY][f\"statistics_{self.name}\"] = statistics\n\n # Let some defaults in case something failed and we had None, otherwise when reading\n # the parquet files using pyarrow, the following error will appear:\n # ArrowInvalid: Schema\n if self.add_raw_input:\n input[DISTILABEL_METADATA_KEY][f\"raw_input_{self.name}\"] = raw_input or [\n {\"content\": \"\", \"role\": \"\"}\n ]\n if self.add_raw_output:\n input[DISTILABEL_METADATA_KEY][f\"raw_output_{self.name}\"] = raw_output or \"\"\n return input\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from pydantic import BaseModel, Field\n\nclass Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\nclass MathShepherdCompleter(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\nMathShepherdCompleter.model_json_schema()\n Returns: Type Description dict[str, Any] JSON Schema of the response to enforce. 
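For reference, a schema equivalent to the one enforced above can be regenerated from the pydantic models quoted in the docstring (a sketch assuming pydantic v2 is installed; the model class is named after the task only for illustration):

```python
from pydantic import BaseModel, Field


class Solution(BaseModel):
    solution: str = Field(
        ..., description="Step by step solution leading to the final answer"
    )


class MathShepherdCompleter(BaseModel):
    solutions: list[Solution] = Field(..., description="List of solutions")


# Produces a JSON schema with a single required "solutions" array of Solution objects.
schema = MathShepherdCompleter.model_json_schema()
print(list(schema["properties"]))  # ['solutions']
```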
Source code in src/distilabel/steps/tasks/math_shepherd/completer.py @override\ndef get_structured_output(self) -> dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel, Field\n\n class Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\n class MathShepherdCompleter(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\n MathShepherdCompleter.model_json_schema()\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Solution\": {\n \"properties\": {\n \"solution\": {\n \"description\": \"Step by step solution leading to the final answer\",\n \"title\": \"Solution\",\n \"type\": \"string\",\n }\n },\n \"required\": [\"solution\"],\n \"title\": \"Solution\",\n \"type\": \"object\",\n }\n },\n \"properties\": {\n \"solutions\": {\n \"description\": \"List of solutions\",\n \"items\": {\"$ref\": \"#/$defs/Solution\"},\n \"title\": \"Solutions\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"solutions\"],\n \"title\": \"MathShepherdGenerator\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdGenerator","title":"MathShepherdGenerator ","text":" Bases: Task Math Shepherd solution generator. This task is in charge of generating completions for a given instruction, in the format expected by the Math Shepherd Completer task. The attributes make the task flexible to be used with different types of dataset and LLMs, but we provide examples for the GSM8K and MATH datasets as presented in the original paper. Before modifying them, review the current defaults to ensure the completions are generated correctly. This task can be used to generate the golden solutions for a given problem if not provided, as well as possible solutions to be then labeled by the Math Shepherd Completer. Only one of solutions or golden_solution will be generated, depending on the value of M. Attributes: Name Type Description system_prompt Optional[str] The system prompt to be used in the completions. The default one has been checked and generates good completions using Llama 3.1 with 8B and 70B, but it can be modified to adapt it to the model and dataset selected. Take into account that the system prompt includes 2 variables in the Jinja2 template, {{extra_rules}} and {{few_shot}}. These variables are used to include extra rules, for example to steer the model towards a specific type of responses, and few shots to add examples. They can be modified to adapt the system prompt to the dataset and model used without needing to change the full system prompt. extra_rules Optional[str] This field can be used to insert extra rules relevant to the type of dataset. For example, in the original paper they used GSM8K and MATH datasets, and this field can be used to insert the rules for the GSM8K dataset. few_shots Optional[str] Few shots to help the model generating the completions, write them in the format of the type of solutions wanted for your dataset. M Optional[PositiveInt] Number of completions to generate for each step. By default is set to 1, which will generate the \"golden_solution\". In this case select a stronger model, as it will be used as the source of true during labelling. 
If M is set to a number greater than 1, the task will generate a list of completions to be labeled by the Math Shepherd Completer task. Input columns - instruction (
str ): The task or instruction. Output columns - golden_solution (
str ): The step by step solution to the instruction. It will be generated if M is equal to 1. - solutions (
List[List[str]] ): A list of possible solutions to the instruction. It will be generated if M is greater than 1. - model_name (
str ): The name of the model used to generate the revision. Categories References Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations Examples: Generate the solution for a given instruction (prefer a stronger model here): from distilabel.steps.tasks import MathShepherdGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdGenerator(\n name=\"golden_solution_generator\",\n llm=llm,\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': '[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"]'}]]\n Generate M completions for a given instruction (using structured output generation): from distilabel.steps.tasks import MathShepherdGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 2048,\n },\n)\ntask = MathShepherdGenerator(\n name=\"solution_generator\",\n llm=llm,\n M=2,\n use_default_structured_output=True,\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n Source code in src/distilabel/steps/tasks/math_shepherd/generator.py class MathShepherdGenerator(Task):\n \"\"\"Math Shepherd solution generator.\n\n This task is in charge of generating completions for a given instruction, in the format expected\n by the Math Shepherd Completer task. 
The attributes make the task flexible to be used with different\n types of dataset and LLMs, but we provide examples for the GSM8K and MATH datasets as presented\n in the original paper. Before modifying them, review the current defaults to ensure the completions\n are generated correctly. This task can be used to generate the golden solutions for a given problem if\n not provided, as well as possible solutions to be then labeled by the Math Shepherd Completer.\n Only one of `solutions` or `golden_solution` will be generated, depending on the value of M.\n\n Attributes:\n system_prompt: The system prompt to be used in the completions. The default one has been\n checked and generates good completions using Llama 3.1 with 8B and 70B,\n but it can be modified to adapt it to the model and dataset selected.\n Take into account that the system prompt includes 2 variables in the Jinja2 template,\n {{extra_rules}} and {{few_shot}}. These variables are used to include extra rules, for example\n to steer the model towards a specific type of responses, and few shots to add examples.\n They can be modified to adapt the system prompt to the dataset and model used without needing\n to change the full system prompt.\n extra_rules: This field can be used to insert extra rules relevant to the type of dataset.\n For example, in the original paper they used GSM8K and MATH datasets, and this field\n can be used to insert the rules for the GSM8K dataset.\n few_shots: Few shots to help the model generating the completions, write them in the\n format of the type of solutions wanted for your dataset.\n M: Number of completions to generate for each step. By default is set to 1, which will\n generate the \"golden_solution\". In this case select a stronger model, as it will be used\n as the source of true during labelling. If M is set to a number greater than 1, the task\n will generate a list of completions to be labeled by the Math Shepherd Completer task.\n\n Input columns:\n - instruction (`str`): The task or instruction.\n\n Output columns:\n - golden_solution (`str`): The step by step solution to the instruction.\n It will be generated if M is equal to 1.\n - solutions (`List[List[str]]`): A list of possible solutions to the instruction.\n It will be generated if M is greater than 1.\n - model_name (`str`): The name of the model used to generate the revision.\n\n Categories:\n - text-generation\n\n References:\n - [`Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations`](https://arxiv.org/abs/2312.08935)\n\n Examples:\n Generate the solution for a given instruction (prefer a stronger model here):\n\n ```python\n from distilabel.steps.tasks import MathShepherdGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n )\n task = MathShepherdGenerator(\n name=\"golden_solution_generator\",\n llm=llm,\n )\n\n task.load()\n\n result = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n )\n # [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. 
She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n # 'golden_solution': '[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"]'}]]\n ```\n\n Generate M completions for a given instruction (using structured output generation):\n\n ```python\n from distilabel.steps.tasks import MathShepherdGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 2048,\n },\n )\n task = MathShepherdGenerator(\n name=\"solution_generator\",\n llm=llm,\n M=2,\n use_default_structured_output=True,\n )\n\n task.load()\n\n result = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n )\n # [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n # 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. 
+\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n ```\n \"\"\"\n\n system_prompt: Optional[str] = SYSTEM_PROMPT\n extra_rules: Optional[str] = RULES_GSM8K\n few_shots: Optional[str] = FEW_SHOTS_GSM8K\n M: Optional[PositiveInt] = None\n\n def load(self) -> None:\n super().load()\n if self.system_prompt is not None:\n self.system_prompt = Template(self.system_prompt).render(\n extra_rules=self.extra_rules or \"\",\n few_shots=self.few_shots or \"\",\n structured_prompt=SYSTEM_PROMPT_STRUCTURED\n if self.use_default_structured_output\n else \"\",\n )\n if self.use_default_structured_output:\n self._template = Template(TEMPLATE_STRUCTURED)\n else:\n self._template = Template(TEMPLATE)\n\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"instruction\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n if self.M:\n return [\"solutions\", \"model_name\"]\n return [\"golden_solution\", \"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n messages = [\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n instruction=input[\"instruction\"],\n M=self.M,\n ),\n }\n ]\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n return messages\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n output_name = \"solutions\" if self.M else \"golden_solution\"\n\n if output is None:\n input.update(**{output_name: None})\n return input\n\n if self.M:\n output_parsed = (\n self._format_structured_output(output)\n if self.use_default_structured_output\n else output.split(\"---\")\n )\n solutions = [split_solution_steps(o) for o in output_parsed]\n else:\n output_parsed = (\n self._format_structured_output(output)[0]\n if self.use_default_structured_output\n else output\n )\n solutions = split_solution_steps(output_parsed)\n\n input.update(**{output_name: solutions})\n return input\n\n @override\n def get_structured_output(self) -> dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel, Field\n\n class Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\n class MathShepherdGenerator(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\n MathShepherdGenerator.model_json_schema()\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Solution\": {\n \"properties\": {\n \"solution\": {\n \"description\": \"Step by step solution leading to the final answer\",\n \"title\": \"Solution\",\n \"type\": \"string\",\n }\n },\n \"required\": [\"solution\"],\n \"title\": \"Solution\",\n \"type\": \"object\",\n }\n },\n \"properties\": {\n \"solutions\": {\n \"description\": \"List of solutions\",\n \"items\": {\"$ref\": \"#/$defs/Solution\"},\n \"title\": \"Solutions\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"solutions\"],\n \"title\": \"MathShepherdGenerator\",\n \"type\": \"object\",\n }\n\n def _format_structured_output(self, output: str) -> list[str]:\n default_output = [\"\"] * self.M if self.M else [\"\"]\n if parsed_output := parse_json_response(output):\n solutions = parsed_output[\"solutions\"]\n extracted_solutions = 
[o[\"solution\"] for o in solutions]\n if len(extracted_solutions) != self.M:\n extracted_solutions = default_output\n return extracted_solutions\n return default_output\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdGenerator.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from pydantic import BaseModel, Field\n\nclass Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\nclass MathShepherdGenerator(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\nMathShepherdGenerator.model_json_schema()\n Returns: Type Description dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/math_shepherd/generator.py @override\ndef get_structured_output(self) -> dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel, Field\n\n class Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\n class MathShepherdGenerator(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\n MathShepherdGenerator.model_json_schema()\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Solution\": {\n \"properties\": {\n \"solution\": {\n \"description\": \"Step by step solution leading to the final answer\",\n \"title\": \"Solution\",\n \"type\": \"string\",\n }\n },\n \"required\": [\"solution\"],\n \"title\": \"Solution\",\n \"type\": \"object\",\n }\n },\n \"properties\": {\n \"solutions\": {\n \"description\": \"List of solutions\",\n \"items\": {\"$ref\": \"#/$defs/Solution\"},\n \"title\": \"Solutions\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"solutions\"],\n \"title\": \"MathShepherdGenerator\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.FormatPRM","title":"FormatPRM ","text":" Bases: Step Helper step to transform the data into the format expected by the PRM model. This step can be used to format the data in one of 2 formats: Following the format presented in peiyi9979/Math-Shepherd, in which case this step creates the columns input and label, where the input is the instruction with the solution (and the tag replaced by a token), and the label is the instruction with the solution, both separated by a newline. Following TRL's format for training, which generates the columns prompt, completions, and labels. The labels correspond to the original tags replaced by boolean values, where True represents correct steps. Attributes: Name Type Description format Literal['math-shepherd', 'trl'] The format to use for the PRM model. \"math-shepherd\" corresponds to the original paper, while \"trl\" is a format prepared to train the model using TRL. step_token str String that serves as a unique token denoting the position for predicting the step score. tags list[str] List of tags that represent the correct and incorrect steps. This only needs to be informed if it's different than the default in MathShepherdCompleter . 
Input columns - instruction (
str ): The task or instruction. - solutions (
list[str] ): List of steps with a solution to the task. Output columns - input (
str ): The instruction with the solutions, where the label tags are replaced by a token. - label (
str ): The instruction with the solutions. - prompt (
str ): The instruction with the solutions, where the label tags are replaced by a token. - completions (
List[str] ): The solution represented as a list of steps. - labels (
List[bool] ): The labels, as a list of booleans, where True represents a good response. Categories - text-manipulation
- columns
References Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations - peiyi9979/Math-Shepherd
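Before the full examples below, this condensed sketch shows what the two target formats look like for a single tagged solution. It assumes the default tags ["+", "-"] and the default "ки" step token, and is standalone illustrative code rather than the step itself:

```python
instruction = "Janet's ducks lay 16 eggs per day... How much does she make?"
solution = [
    "Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. +",
    "Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day. +",
]

# "math-shepherd": the trailing tag is swapped for the step token in `input`,
# while `label` keeps the original tags.
math_shepherd = {
    "input": instruction + " " + "\n".join(s[:-1] + "ки" for s in solution),
    "label": instruction + " " + "\n".join(solution),
}

# "trl": the trailing tags become one boolean label per step.
trl = {
    "prompt": instruction,
    "completions": [s[:-1].strip() for s in solution],
    "labels": [s[-1] == "+" for s in solution],
}
print(trl["labels"])  # [True, True]
```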
Examples: Prepare your data to train a PRM model with the Math-Shepherd format: from distilabel.steps.tasks import FormatPRM\nfrom distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(columns=[\"solutions\"])\nexpand_columns.load()\n\n# Define our PRM formatter\nformatter = FormatPRM()\nformatter.load()\n\n# Expand the solutions column as it comes from the MathShepherdCompleter\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"]]\n },\n ]\n )\n)\nresult = next(formatter.process(result))\n Prepare your data to train a PRM model with the TRL format: from distilabel.steps.tasks import FormatPRM\nfrom distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(columns=[\"solutions\"])\nexpand_columns.load()\n\n# Define our PRM formatter\nformatter = FormatPRM(format=\"trl\")\nformatter.load()\n\n# Expand the solutions column as it comes from the MathShepherdCompleter\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"]]\n },\n ]\n )\n)\n\nresult = next(formatter.process(result))\n# {\n# \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# \"solutions\": [\n# \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\",\n# \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\",\n# \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"\n# ],\n# \"prompt\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?\",\n# \"completions\": [\n# \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required.\",\n# \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber.\",\n# \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3\"\n# ],\n# \"labels\": [\n# true,\n# true,\n# true\n# ]\n# }\n Citations: ```\n@misc{wang2024mathshepherdverifyreinforcellms,\n title={Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations},\n author={Peiyi Wang and Lei Li and Zhihong Shao and R. X. Xu and Damai Dai and Yifei Li and Deli Chen and Y. Wu and Zhifang Sui},\n year={2024},\n eprint={2312.08935},\n archivePrefix={arXiv},\n primaryClass={cs.AI},\n url={https://arxiv.org/abs/2312.08935},\n}\n```\n Source code in src/distilabel/steps/tasks/math_shepherd/utils.py class FormatPRM(Step):\n \"\"\"Helper step to transform the data into the format expected by the PRM model.\n\n This step can be used to format the data in one of 2 formats:\n Following the format presented\n in [peiyi9979/Math-Shepherd](https://huggingface.co/datasets/peiyi9979/Math-Shepherd?row=0),\n in which case this step creates the columns input and label, where the input is the instruction\n with the solution (and the tag replaced by a token), and the label is the instruction\n with the solution, both separated by a newline.\n Following TRL's format for training, which generates the columns prompt, completions, and labels.\n The labels correspond to the original tags replaced by boolean values, where True represents\n correct steps.\n\n Attributes:\n format: The format to use for the PRM model.\n \"math-shepherd\" corresponds to the original paper, while \"trl\" is a format\n prepared to train the model using TRL.\n step_token: String that serves as a unique token denoting the position\n for predicting the step score.\n tags: List of tags that represent the correct and incorrect steps.\n This only needs to be informed if it's different than the default in\n `MathShepherdCompleter`.\n\n Input columns:\n - instruction (`str`): The task or instruction.\n - solutions (`list[str]`): List of steps with a solution to the task.\n\n Output columns:\n - input (`str`): The instruction with the solutions, where the label tags\n are replaced by a token.\n - label (`str`): The instruction with the solutions.\n - prompt (`str`): The instruction with the solutions, where the label tags\n are replaced by a token.\n - completions (`List[str]`): The solution represented as a list of steps.\n - labels (`List[bool]`): The labels, as a list of booleans, where True represents\n a good response.\n\n Categories:\n - text-manipulation\n - columns\n\n References:\n - [`Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations`](https://arxiv.org/abs/2312.08935)\n - [peiyi9979/Math-Shepherd](https://huggingface.co/datasets/peiyi9979/Math-Shepherd?row=0)\n\n Examples:\n Prepare your data to train a PRM model with the Math-Shepherd format:\n\n ```python\n from distilabel.steps.tasks import FormatPRM\n from distilabel.steps import ExpandColumns\n\n expand_columns = ExpandColumns(columns=[\"solutions\"])\n expand_columns.load()\n\n # Define our PRM formatter\n formatter = FormatPRM()\n formatter.load()\n\n # Expand the solutions column as it comes from the MathShepherdCompleter\n result = 
next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"]]\n },\n ]\n )\n )\n result = next(formatter.process(result))\n ```\n\n Prepare your data to train a PRM model with the TRL format:\n\n ```python\n from distilabel.steps.tasks import FormatPRM\n from distilabel.steps import ExpandColumns\n\n expand_columns = ExpandColumns(columns=[\"solutions\"])\n expand_columns.load()\n\n # Define our PRM formatter\n formatter = FormatPRM(format=\"trl\")\n formatter.load()\n\n # Expand the solutions column as it comes from the MathShepherdCompleter\n result = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. 
+\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"]]\n },\n ]\n )\n )\n\n result = next(formatter.process(result))\n # {\n # \"instruction\": \"Janet\\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n # \"solutions\": [\n # \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\",\n # \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\",\n # \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"\n # ],\n # \"prompt\": \"Janet\\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n # \"completions\": [\n # \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required.\",\n # \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber.\",\n # \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. 
The answer is: 3\"\n # ],\n # \"labels\": [\n # true,\n # true,\n # true\n # ]\n # }\n ```\n\n Citations:\n\n ```\n @misc{wang2024mathshepherdverifyreinforcellms,\n title={Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations},\n author={Peiyi Wang and Lei Li and Zhihong Shao and R. X. Xu and Damai Dai and Yifei Li and Deli Chen and Y. Wu and Zhifang Sui},\n year={2024},\n eprint={2312.08935},\n archivePrefix={arXiv},\n primaryClass={cs.AI},\n url={https://arxiv.org/abs/2312.08935},\n }\n ```\n \"\"\"\n\n format: Literal[\"math-shepherd\", \"trl\"] = \"math-shepherd\"\n step_token: str = \"\u043a\u0438\"\n tags: list[str] = [\"+\", \"-\"]\n\n def model_post_init(self, __context: Any) -> None:\n super().model_post_init(__context)\n if self.format == \"math-shepherd\":\n self._formatter = self._format_math_shepherd\n else:\n self._formatter = self._format_trl\n\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"instruction\", \"solutions\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n if self.format == \"math-shepherd\":\n return [\"input\", \"label\"]\n return [\"prompt\", \"completions\", \"labels\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The process prepares the data for the `APIGenGenerator` task.\n\n If a single example is provided, it is copied to avoid raising an error.\n\n Args:\n inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n self._formatter(input)\n\n yield inputs # type: ignore\n\n def _format_math_shepherd(\n self, input: dict[str, str]\n ) -> dict[str, Union[str, list[str]]]:\n instruction = input[\"instruction\"]\n replaced = []\n # At this stage, the \"solutions\" column can only contain a single solution,\n # and the last item of each solution is the tag.\n solution = input[\"solutions\"]\n for step in solution:\n # Check there's a string, because the step that generated\n # the solutions could have failed, and we would have an empty list.\n replaced.append(step[:-1] + self.step_token if len(step) > 1 else step)\n\n input[\"input\"] = instruction + \" \" + \"\\n\".join(replaced)\n input[\"label\"] = instruction + \" \" + \"\\n\".join(solution)\n\n return input # type: ignore\n\n def _format_trl(\n self, input: dict[str, str]\n ) -> dict[str, Union[str, list[str], list[bool]]]:\n input[\"prompt\"] = input[\"instruction\"]\n completions: list[str] = []\n labels: list[bool] = []\n for step in input[\"solutions\"]:\n token = step[-1]\n completions.append(step[:-1].strip())\n labels.append(True if token == self.tags[0] else False)\n\n input[\"completions\"] = completions # type: ignore\n input[\"labels\"] = labels # type: ignore\n\n return input # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.FormatPRM.process","title":"process(inputs) ","text":"The process prepares the data for the APIGenGenerator task. If a single example is provided, it is copied to avoid raising an error. Parameters: Name Type Description Default inputs StepInput A list of dictionaries with the input data. required Yields: Type Description StepOutput A list of dictionaries with the output data. 
Source code in src/distilabel/steps/tasks/math_shepherd/utils.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The process prepares the data for the `APIGenGenerator` task.\n\n If a single example is provided, it is copied to avoid raising an error.\n\n Args:\n inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n self._formatter(input)\n\n yield inputs # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM","title":"PairRM ","text":" Bases: Step Rank the candidates based on the input using the LLM model. Attributes: Name Type Description model str The model to use for the ranking. Defaults to \"llm-blender/PairRM\" . instructions Optional[str] The instructions to use for the model. Defaults to None . Input columns - inputs (
List[Dict[str, Any]] ): The input text or conversation to rank the candidates for. - candidates (
List[Dict[str, Any]] ): The candidates to rank. Output columns - ranks (
List[int] ): The ranks of the candidates based on the input. - ranked_candidates (
List[Dict[str, Any]] ): The candidates ranked based on the input. - model_name (
str ): The model name used to rank the candidate responses. Defaults to \"llm-blender/PairRM\" . References - LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion.
- Pair Ranking Model.
Categories Note This step differs to other tasks as there is a single implementation of this model currently, and we will use a specific LLM . Examples: Rank LLM candidates: from distilabel.steps.tasks import PairRM\n\n# Consider this as a placeholder for your actual LLM.\npair_rm = PairRM()\n\npair_rm.load()\n\nresult = next(\n scorer.process(\n [\n {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n ]\n )\n)\n# result\n# [\n# {\n# 'input': 'Hello, how are you?',\n# 'candidates': ['fine', 'good', 'bad'],\n# 'ranks': [2, 1, 3],\n# 'ranked_candidates': ['good', 'fine', 'bad'],\n# 'model_name': 'llm-blender/PairRM',\n# }\n# ]\n Citations @misc{jiang2023llmblenderensemblinglargelanguage,\n title={LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion},\n author={Dongfu Jiang and Xiang Ren and Bill Yuchen Lin},\n year={2023},\n eprint={2306.02561},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2306.02561},\n}\n Source code in src/distilabel/steps/tasks/pair_rm.py class PairRM(Step):\n \"\"\"Rank the candidates based on the input using the `LLM` model.\n\n Attributes:\n model: The model to use for the ranking. Defaults to `\"llm-blender/PairRM\"`.\n instructions: The instructions to use for the model. Defaults to `None`.\n\n Input columns:\n - inputs (`List[Dict[str, Any]]`): The input text or conversation to rank the candidates for.\n - candidates (`List[Dict[str, Any]]`): The candidates to rank.\n\n Output columns:\n - ranks (`List[int]`): The ranks of the candidates based on the input.\n - ranked_candidates (`List[Dict[str, Any]]`): The candidates ranked based on the input.\n - model_name (`str`): The model name used to rank the candidate responses. Defaults to `\"llm-blender/PairRM\"`.\n\n References:\n - [LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion](https://arxiv.org/abs/2306.02561).\n - [Pair Ranking Model](https://huggingface.co/llm-blender/PairRM).\n\n Categories:\n - preference\n\n Note:\n This step differs to other tasks as there is a single implementation of this model\n currently, and we will use a specific `LLM`.\n\n Examples:\n Rank LLM candidates:\n\n ```python\n from distilabel.steps.tasks import PairRM\n\n # Consider this as a placeholder for your actual LLM.\n pair_rm = PairRM()\n\n pair_rm.load()\n\n result = next(\n scorer.process(\n [\n {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n ]\n )\n )\n # result\n # [\n # {\n # 'input': 'Hello, how are you?',\n # 'candidates': ['fine', 'good', 'bad'],\n # 'ranks': [2, 1, 3],\n # 'ranked_candidates': ['good', 'fine', 'bad'],\n # 'model_name': 'llm-blender/PairRM',\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{jiang2023llmblenderensemblinglargelanguage,\n title={LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion},\n author={Dongfu Jiang and Xiang Ren and Bill Yuchen Lin},\n year={2023},\n eprint={2306.02561},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2306.02561},\n }\n ```\n \"\"\"\n\n model: str = \"llm-blender/PairRM\"\n instructions: Optional[str] = None\n\n def load(self) -> None:\n \"\"\"Loads the PairRM model provided via `model` with `llm_blender.Blender`, which is the\n custom library for running the inference for the PairRM models.\"\"\"\n try:\n import llm_blender\n except ImportError as e:\n raise ImportError(\n \"The `llm_blender` package is required to use the 
`PairRM` class.\"\n \"Please install it with `pip install git+https://github.com/yuchenlin/LLM-Blender.git`.\"\n ) from e\n\n self._blender = llm_blender.Blender()\n self._blender.loadranker(self.model)\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The input columns correspond to the two required arguments from `Blender.rank`:\n `inputs` and `candidates`.\"\"\"\n return [\"input\", \"candidates\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs will include the `ranks` and the `ranked_candidates`.\"\"\"\n return [\"ranks\", \"ranked_candidates\", \"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"The input is expected to be a dictionary with the keys `input` and `candidates`,\n where the `input` corresponds to the instruction of a model and `candidates` are a\n list of responses to be ranked.\n \"\"\"\n return {\"input\": input[\"input\"], \"candidates\": input[\"candidates\"]}\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generates the ranks for the candidates based on the input.\n\n The ranks are the positions of the candidates, where lower is better,\n and the ranked candidates correspond to the candidates sorted according to the\n ranks obtained.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n An iterator with the inputs containing the `ranks`, `ranked_candidates`, and `model_name`.\n \"\"\"\n input_texts = []\n candidates = []\n for input in inputs:\n formatted_input = self.format_input(input)\n input_texts.append(formatted_input[\"input\"])\n candidates.append(formatted_input[\"candidates\"])\n\n instructions = (\n [self.instructions] * len(input_texts) if self.instructions else None\n )\n\n ranks = self._blender.rank(\n input_texts,\n candidates,\n instructions=instructions,\n return_scores=False,\n batch_size=self.input_batch_size,\n )\n # Sort the candidates based on the ranks\n ranked_candidates = np.take_along_axis(\n np.array(candidates), ranks - 1, axis=1\n ).tolist()\n ranks = ranks.tolist()\n for input, rank, ranked_candidate in zip(inputs, ranks, ranked_candidates):\n input[\"ranks\"] = rank\n input[\"ranked_candidates\"] = ranked_candidate\n input[\"model_name\"] = self.model\n\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.inputs","title":"inputs: StepColumns property ","text":"The input columns correspond to the two required arguments from Blender.rank : inputs and candidates . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.outputs","title":"outputs: StepColumns property ","text":"The outputs will include the ranks and the ranked_candidates . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.load","title":"load() ","text":"Loads the PairRM model provided via model with llm_blender.Blender , which is the custom library for running the inference for the PairRM models. 
Source code in src/distilabel/steps/tasks/pair_rm.py def load(self) -> None:\n \"\"\"Loads the PairRM model provided via `model` with `llm_blender.Blender`, which is the\n custom library for running the inference for the PairRM models.\"\"\"\n try:\n import llm_blender\n except ImportError as e:\n raise ImportError(\n \"The `llm_blender` package is required to use the `PairRM` class.\"\n \"Please install it with `pip install git+https://github.com/yuchenlin/LLM-Blender.git`.\"\n ) from e\n\n self._blender = llm_blender.Blender()\n self._blender.loadranker(self.model)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.format_input","title":"format_input(input) ","text":"The input is expected to be a dictionary with the keys input and candidates , where the input corresponds to the instruction of a model and candidates are a list of responses to be ranked. Source code in src/distilabel/steps/tasks/pair_rm.py def format_input(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"The input is expected to be a dictionary with the keys `input` and `candidates`,\n where the `input` corresponds to the instruction of a model and `candidates` are a\n list of responses to be ranked.\n \"\"\"\n return {\"input\": input[\"input\"], \"candidates\": input[\"candidates\"]}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.process","title":"process(inputs) ","text":"Generates the ranks for the candidates based on the input. The ranks are the positions of the candidates, where lower is better, and the ranked candidates correspond to the candidates sorted according to the ranks obtained. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Yields: Type Description StepOutput An iterator with the inputs containing the ranks , ranked_candidates , and model_name . Source code in src/distilabel/steps/tasks/pair_rm.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generates the ranks for the candidates based on the input.\n\n The ranks are the positions of the candidates, where lower is better,\n and the ranked candidates correspond to the candidates sorted according to the\n ranks obtained.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n An iterator with the inputs containing the `ranks`, `ranked_candidates`, and `model_name`.\n \"\"\"\n input_texts = []\n candidates = []\n for input in inputs:\n formatted_input = self.format_input(input)\n input_texts.append(formatted_input[\"input\"])\n candidates.append(formatted_input[\"candidates\"])\n\n instructions = (\n [self.instructions] * len(input_texts) if self.instructions else None\n )\n\n ranks = self._blender.rank(\n input_texts,\n candidates,\n instructions=instructions,\n return_scores=False,\n batch_size=self.input_batch_size,\n )\n # Sort the candidates based on the ranks\n ranked_candidates = np.take_along_axis(\n np.array(candidates), ranks - 1, axis=1\n ).tolist()\n ranks = ranks.tolist()\n for input, rank, ranked_candidate in zip(inputs, ranks, ranked_candidates):\n input[\"ranks\"] = rank\n input[\"ranked_candidates\"] = ranked_candidate\n input[\"model_name\"] = self.model\n\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval","title":"PrometheusEval ","text":" Bases: Task Critique and rank the quality of generations from an LLM using Prometheus 2.0. 
PrometheusEval is a task created for Prometheus 2.0, covering both the absolute and relative evaluations. The absolute evaluation i.e. mode=\"absolute\" is used to evaluate a single generation from an LLM for a given instruction. The relative evaluation i.e. mode=\"relative\" is used to evaluate two generations from an LLM for a given instruction. Both evaluations provide the possibility of using a reference answer to compare with or without the reference attribute, and both are based on a score rubric that critiques the generation/s based on the following default aspects: helpfulness , harmlessness , honesty , factual-validity , and reasoning , that can be overridden via rubrics , and the selected rubric is set via the attribute rubric . Note The PrometheusEval task is better suited and intended to be used with any of the Prometheus 2.0 models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0, and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting and quality is not guaranteed if using another model, even though some other models may be able to correctly follow the formatting and generate insightful critiques too. Attributes: Name Type Description mode Literal['absolute', 'relative'] the evaluation mode to use, either absolute or relative . It defines whether the task will evaluate one or two generations. rubric str the score rubric to use within the prompt to run the critique based on different aspects. Can be any existing key in the rubrics attribute, which by default means that it can be: helpfulness , harmlessness , honesty , factual-validity , or reasoning . Those will only work if using the default rubrics , otherwise, the provided rubrics should be used. rubrics Optional[Dict[str, str]] a dictionary containing the different rubrics to use for the critique, where the keys are the rubric names and the values are the rubric descriptions. The default rubrics are the following: helpfulness , harmlessness , honesty , factual-validity , and reasoning . reference bool a boolean flag to indicate whether a reference answer / completion will be provided, so that the model critique is based on the comparison with it. It implies that the column reference needs to be provided within the input data in addition to the rest of the inputs. _template Union[Template, None] a Jinja2 template used to format the input for the LLM. Input columns - instruction (
str ): The instruction to use as reference. - generation (
str , optional): The generated text from the given instruction . This column is required if mode=absolute . - generations (
List[str] , optional): The generated texts from the given instruction . It should contain 2 generations only. This column is required if mode=relative . - reference (
str , optional): The reference / golden answer for the instruction , for the LLM to compare against. Output columns - feedback (
str ): The feedback explaining the result below, as critiqued by the LLM using the pre-defined score rubric, compared against reference if provided. - result (
Union[int, Literal[\"A\", \"B\"]] ): If mode=absolute , then the result contains the score for the generation on a Likert scale from 1-5, otherwise, if mode=relative , then the result contains either \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of generations if result='A' or the index 1 if result='B' . - model_name (
str ): The model name used to generate the feedback and result . Categories References - Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models
- prometheus-eval: Evaluate your LLM's response with Prometheus \ud83d\udcaf
Examples: Critique and evaluate LLM generation quality using Prometheus 2_0: from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"factual-validity\"\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generation\": \"something done\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n Critique for relative evaluation: from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"relative\",\n rubric=\"honesty\"\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generations': ['something done', 'other thing'],\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 'something done',\n# }\n# ]\n Critique with a custom rubric: from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"custom\",\n rubrics={\n \"custom\": \"[A]\\nScore 1: A\\nScore 2: B\\nScore 3: C\\nScore 4: D\\nScore 5: E\"\n }\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generation\": \"something done\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n Critique using a reference answer: from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"helpfulness\",\n reference=True,\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\n \"instruction\": \"make something\",\n \"generation\": \"something done\",\n \"reference\": \"this is a reference answer\",\n },\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'reference': 'this is a reference answer',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n Citations @misc{kim2024prometheus2opensource,\n title={Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language 
Models},\n author={Seungone Kim and Juyoung Suk and Shayne Longpre and Bill Yuchen Lin and Jamin Shin and Sean Welleck and Graham Neubig and Moontae Lee and Kyungjae Lee and Minjoon Seo},\n year={2024},\n eprint={2405.01535},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2405.01535},\n}\n Source code in src/distilabel/steps/tasks/prometheus_eval.py class PrometheusEval(Task):\n \"\"\"Critique and rank the quality of generations from an `LLM` using Prometheus 2.0.\n\n `PrometheusEval` is a task created for Prometheus 2.0, covering both the absolute and relative\n evaluations. The absolute evaluation i.e. `mode=\"absolute\"` is used to evaluate a single generation from\n an LLM for a given instruction. The relative evaluation i.e. `mode=\"relative\"` is used to evaluate two generations from an LLM\n for a given instruction.\n Both evaluations provide the possibility of using a reference answer to compare with or withoug\n the `reference` attribute, and both are based on a score rubric that critiques the generation/s\n based on the following default aspects: `helpfulness`, `harmlessness`, `honesty`, `factual-validity`,\n and `reasoning`, that can be overridden via `rubrics`, and the selected rubric is set via the attribute\n `rubric`.\n\n Note:\n The `PrometheusEval` task is better suited and intended to be used with any of the Prometheus 2.0\n models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0,\n and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting\n and quality is not guaranteed if using another model, even though some other models may be able to\n correctly follow the formatting and generate insightful critiques too.\n\n Attributes:\n mode: the evaluation mode to use, either `absolute` or `relative`. It defines whether the task\n will evaluate one or two generations.\n rubric: the score rubric to use within the prompt to run the critique based on different aspects.\n Can be any existing key in the `rubrics` attribute, which by default means that it can be:\n `helpfulness`, `harmlessness`, `honesty`, `factual-validity`, or `reasoning`. Those will only\n work if using the default `rubrics`, otherwise, the provided `rubrics` should be used.\n rubrics: a dictionary containing the different rubrics to use for the critique, where the keys are\n the rubric names and the values are the rubric descriptions. The default rubrics are the following:\n `helpfulness`, `harmlessness`, `honesty`, `factual-validity`, and `reasoning`.\n reference: a boolean flag to indicate whether a reference answer / completion will be provided, so\n that the model critique is based on the comparison with it. It implies that the column `reference`\n needs to be provided within the input data in addition to the rest of the inputs.\n _template: a Jinja2 template used to format the input for the LLM.\n\n Input columns:\n - instruction (`str`): The instruction to use as reference.\n - generation (`str`, optional): The generated text from the given `instruction`. This column is required\n if `mode=absolute`.\n - generations (`List[str]`, optional): The generated texts from the given `instruction`. It should\n contain 2 generations only. 
This column is required if `mode=relative`.\n - reference (`str`, optional): The reference / golden answer for the `instruction`, to be used by the LLM\n for comparison against.\n\n Output columns:\n - feedback (`str`): The feedback explaining the result below, as critiqued by the LLM using the\n pre-defined score rubric, compared against `reference` if provided.\n - result (`Union[int, Literal[\"A\", \"B\"]]`): If `mode=absolute`, then the result contains the score for the\n `generation` in a likert-scale from 1-5, otherwise, if `mode=relative`, then the result contains either\n \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of `generations` if `result='A'` or the\n index 1 if `result='B'`.\n - model_name (`str`): The model name used to generate the `feedback` and `result`.\n\n Categories:\n - critique\n - preference\n\n References:\n - [Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models](https://arxiv.org/abs/2405.01535)\n - [prometheus-eval: Evaluate your LLM's response with Prometheus \ud83d\udcaf](https://github.com/prometheus-eval/prometheus-eval)\n\n Examples:\n Critique and evaluate LLM generation quality using Prometheus 2_0:\n\n ```python\n from distilabel.steps.tasks import PrometheusEval\n from distilabel.models import vLLM\n\n # Consider this as a placeholder for your actual LLM.\n prometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"factual-validity\"\n )\n\n prometheus.load()\n\n result = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generation\": \"something done\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'make something',\n # 'generation': 'something done',\n # 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n # 'feedback': 'the feedback',\n # 'result': 6,\n # }\n # ]\n ```\n\n Critique for relative evaluation:\n\n ```python\n from distilabel.steps.tasks import PrometheusEval\n from distilabel.models import vLLM\n\n # Consider this as a placeholder for your actual LLM.\n prometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n ),\n mode=\"relative\",\n rubric=\"honesty\"\n )\n\n prometheus.load()\n\n result = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'make something',\n # 'generations': ['something done', 'other thing'],\n # 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n # 'feedback': 'the feedback',\n # 'result': 'something done',\n # }\n # ]\n ```\n\n Critique with a custom rubric:\n\n ```python\n from distilabel.steps.tasks import PrometheusEval\n from distilabel.models import vLLM\n\n # Consider this as a placeholder for your actual LLM.\n prometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"custom\",\n rubrics={\n \"custom\": \"[A]\\\\nScore 1: A\\\\nScore 2: B\\\\nScore 3: C\\\\nScore 4: D\\\\nScore 5: E\"\n }\n )\n\n prometheus.load()\n\n result = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", 
\"generation\": \"something done\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'make something',\n # 'generation': 'something done',\n # 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n # 'feedback': 'the feedback',\n # 'result': 6,\n # }\n # ]\n ```\n\n Critique using a reference answer:\n\n ```python\n from distilabel.steps.tasks import PrometheusEval\n from distilabel.models import vLLM\n\n # Consider this as a placeholder for your actual LLM.\n prometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"helpfulness\",\n reference=True,\n )\n\n prometheus.load()\n\n result = next(\n prometheus.process(\n [\n {\n \"instruction\": \"make something\",\n \"generation\": \"something done\",\n \"reference\": \"this is a reference answer\",\n },\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'make something',\n # 'generation': 'something done',\n # 'reference': 'this is a reference answer',\n # 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n # 'feedback': 'the feedback',\n # 'result': 6,\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{kim2024prometheus2opensource,\n title={Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models},\n author={Seungone Kim and Juyoung Suk and Shayne Longpre and Bill Yuchen Lin and Jamin Shin and Sean Welleck and Graham Neubig and Moontae Lee and Kyungjae Lee and Minjoon Seo},\n year={2024},\n eprint={2405.01535},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2405.01535},\n }\n ```\n \"\"\"\n\n mode: Literal[\"absolute\", \"relative\"]\n rubric: str\n rubrics: Optional[Dict[str, str]] = Field(default=_DEFAULT_RUBRICS)\n reference: bool = False\n\n _template: Union[Template, None] = PrivateAttr(...)\n\n @model_validator(mode=\"after\")\n def validate_rubric_and_rubrics(self) -> Self:\n if not isinstance(self.rubrics, dict) or len(self.rubrics) < 1:\n raise DistilabelUserError(\n \"Provided `rubrics` must be a Python dictionary with string keys and string values.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n def rubric_matches_pattern(rubric: str) -> bool:\n \"\"\"Checks if the provided rubric matches the pattern of the default rubrics.\"\"\"\n pattern = r\"^\\[.*?\\]\\n(?:Score [1-4]: .*?\\n){4}(?:Score 5: .*?)\"\n return bool(re.match(pattern, rubric, re.MULTILINE))\n\n if not all(rubric_matches_pattern(value) for value in self.rubrics.values()):\n raise DistilabelUserError(\n \"Provided rubrics should match the format of the default rubrics, which\"\n \" is as follows: `[<scoring criteria>]\\nScore 1: <description>\\nScore 2: <description>\\n\"\n \"Score 3: <description>\\nScore 4: <description>\\nScore 5: <description>`; replacing\"\n \" `<scoring criteria>` and `<description>` with the actual criteria and description\"\n \" for each or the scores, respectively.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n if self.rubric not in self.rubrics:\n raise DistilabelUserError(\n f\"Provided rubric '{self.rubric}' is not among the available rubrics: {', '.join(self.rubrics.keys())}.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n return self\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation\n depending on the `mode` value, and either with or without reference, depending on the\n value of 
`reference`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"prometheus\"\n / (\n f\"{self.mode}_without_reference.jinja2\"\n if self.reference is False\n else f\"{self.mode}_with_reference.jinja2\"\n )\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The default inputs for the task are the `instruction` and the `generation`\n if `reference=False`, otherwise, the inputs are `instruction`, `generation`, and\n `reference`.\"\"\"\n if self.mode == \"absolute\":\n if self.reference:\n return [\"instruction\", \"generation\", \"reference\"]\n return [\"instruction\", \"generation\"]\n else:\n if self.reference:\n return [\"instruction\", \"generations\", \"reference\"]\n return [\"instruction\", \"generations\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` where the prompt is formatted according\n to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction\n from the user, including a pre-defined system prompt.\"\"\"\n template_kwargs = {\n \"instruction\": input[\"instruction\"],\n \"rubric\": self.rubrics[self.rubric],\n }\n if self.reference:\n template_kwargs[\"reference\"] = input[\"reference\"]\n\n if self.mode == \"absolute\":\n if not isinstance(input[\"generation\"], str):\n raise DistilabelUserError(\n f\"Provided `generation` is of type {type(input['generation'])} but a string\"\n \" should be provided instead.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n template_kwargs[\"generation\"] = input[\"generation\"]\n system_message = (\n \"You are a fair judge assistant tasked with providing clear, objective feedback based\"\n \" on specific criteria, ensuring each assessment reflects the absolute standards set\"\n \" for performance.\"\n )\n else: # self.mode == \"relative\"\n if (\n not isinstance(input[\"generations\"], list)\n or not all(\n isinstance(generation, str) for generation in input[\"generations\"]\n )\n or len(input[\"generations\"]) != 2\n ):\n raise DistilabelUserError(\n f\"Provided `generations` is of type {type(input['generations'])} but a list of strings with length 2 should be provided instead.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n template_kwargs[\"generations\"] = input[\"generations\"]\n system_message = (\n \"You are a fair judge assistant assigned to deliver insightful feedback that compares\"\n \" individual performances, highlighting how each stands relative to others within the\"\n \" same cohort.\"\n )\n\n return [\n {\n \"role\": \"system\",\n \"content\": system_message,\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render(**template_kwargs), # type: ignore\n },\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are the `feedback` and the `result` generated by Prometheus,\n as well as the `model_name` which is automatically included based on the `LLM` used.\n \"\"\"\n return [\"feedback\", \"result\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dict with the keys `feedback` and `result` captured\n using a regex from the Prometheus output.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Optionally provided in case it's useful to build the output.\n\n Returns:\n A dict with the keys `feedback` and `result` generated by the LLM.\n \"\"\"\n if output is None:\n return {\"feedback\": None, \"result\": None}\n\n parts = output.split(\"[RESULT]\")\n if len(parts) != 2:\n return {\"feedback\": None, \"result\": None}\n\n feedback, result = parts[0].strip(), parts[1].strip()\n if feedback.startswith(\"Feedback:\"):\n feedback = feedback[len(\"Feedback:\") :].strip()\n if self.mode == \"absolute\":\n if not result.isdigit() or result not in [\"1\", \"2\", \"3\", \"4\", \"5\"]:\n return {\"feedback\": None, \"result\": None}\n return {\"feedback\": feedback, \"result\": int(result)}\n else: # self.mode == \"relative\"\n if result not in [\"A\", \"B\"]:\n return {\"feedback\": None, \"result\": None}\n return {\"feedback\": feedback, \"result\": result}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.inputs","title":"inputs: List[str] property ","text":"The default inputs for the task are the instruction and the generation if reference=False , otherwise, the inputs are instruction , generation , and reference . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.outputs","title":"outputs: List[str] property ","text":"The output for the task are the feedback and the result generated by Prometheus, as well as the model_name which is automatically included based on the LLM used. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.load","title":"load() ","text":"Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation depending on the mode value, and either with or without reference, depending on the value of reference . Source code in src/distilabel/steps/tasks/prometheus_eval.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation\n depending on the `mode` value, and either with or without reference, depending on the\n value of `reference`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"prometheus\"\n / (\n f\"{self.mode}_without_reference.jinja2\"\n if self.reference is False\n else f\"{self.mode}_with_reference.jinja2\"\n )\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType where the prompt is formatted according to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction from the user, including a pre-defined system prompt. 
Source code in src/distilabel/steps/tasks/prometheus_eval.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` where the prompt is formatted according\n to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction\n from the user, including a pre-defined system prompt.\"\"\"\n template_kwargs = {\n \"instruction\": input[\"instruction\"],\n \"rubric\": self.rubrics[self.rubric],\n }\n if self.reference:\n template_kwargs[\"reference\"] = input[\"reference\"]\n\n if self.mode == \"absolute\":\n if not isinstance(input[\"generation\"], str):\n raise DistilabelUserError(\n f\"Provided `generation` is of type {type(input['generation'])} but a string\"\n \" should be provided instead.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n template_kwargs[\"generation\"] = input[\"generation\"]\n system_message = (\n \"You are a fair judge assistant tasked with providing clear, objective feedback based\"\n \" on specific criteria, ensuring each assessment reflects the absolute standards set\"\n \" for performance.\"\n )\n else: # self.mode == \"relative\"\n if (\n not isinstance(input[\"generations\"], list)\n or not all(\n isinstance(generation, str) for generation in input[\"generations\"]\n )\n or len(input[\"generations\"]) != 2\n ):\n raise DistilabelUserError(\n f\"Provided `generations` is of type {type(input['generations'])} but a list of strings with length 2 should be provided instead.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n template_kwargs[\"generations\"] = input[\"generations\"]\n system_message = (\n \"You are a fair judge assistant assigned to deliver insightful feedback that compares\"\n \" individual performances, highlighting how each stands relative to others within the\"\n \" same cohort.\"\n )\n\n return [\n {\n \"role\": \"system\",\n \"content\": system_message,\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render(**template_kwargs), # type: ignore\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.format_output","title":"format_output(output, input) ","text":"The output is formatted as a dict with the keys feedback and result captured using a regex from the Prometheus output. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Optionally provided in case it's useful to build the output. required Returns: Type Description Dict[str, Any] A dict with the keys feedback and result generated by the LLM. Source code in src/distilabel/steps/tasks/prometheus_eval.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dict with the keys `feedback` and `result` captured\n using a regex from the Prometheus output.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Optionally provided in case it's useful to build the output.\n\n Returns:\n A dict with the keys `feedback` and `result` generated by the LLM.\n \"\"\"\n if output is None:\n return {\"feedback\": None, \"result\": None}\n\n parts = output.split(\"[RESULT]\")\n if len(parts) != 2:\n return {\"feedback\": None, \"result\": None}\n\n feedback, result = parts[0].strip(), parts[1].strip()\n if feedback.startswith(\"Feedback:\"):\n feedback = feedback[len(\"Feedback:\") :].strip()\n if self.mode == \"absolute\":\n if not result.isdigit() or result not in [\"1\", \"2\", \"3\", \"4\", \"5\"]:\n return {\"feedback\": None, \"result\": None}\n return {\"feedback\": feedback, \"result\": int(result)}\n else: # self.mode == \"relative\"\n if result not in [\"A\", \"B\"]:\n return {\"feedback\": None, \"result\": None}\n return {\"feedback\": feedback, \"result\": result}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer","title":"QualityScorer ","text":" Bases: Task Score responses based on their quality using an LLM . QualityScorer is a pre-defined task that defines the instruction as the input and score as the output. This task is used to rate the quality of instructions and responses. It's an implementation of the quality score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs are scored in terms of quality, obtaining a quality score for each instruction. Attributes: Name Type Description _template Union[Template, None] a Jinja2 template used to format the input for the LLM. Input columns - instruction (
str ): The instruction that was used to generate the responses . - responses (
List[str] ): The responses to be scored. Each response forms a pair with the instruction. Output columns - scores (
List[float] ): The score for each instruction. - model_name (
str ): The model name used to generate the scores. Categories References What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning Examples: Evaluate the quality of your instructions: from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n)\n# result\n[\n {\n 'instructions': 'instruction',\n 'model_name': 'test',\n 'scores': [5, 3, 1],\n }\n]\n Generate structured output with default schema: from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\nscorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n)\n\n# result\n[{'instruction': 'instruction',\n'responses': ['good response', 'weird response', 'bad response'],\n'scores': [1, 2, 3],\n'distilabel_metadata': {'raw_output_quality_scorer_0': '{ \"scores\": [1, 2, 3] }'},\n'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n Source code in src/distilabel/steps/tasks/quality_scorer.py class QualityScorer(Task):\n \"\"\"Score responses based on their quality using an `LLM`.\n\n `QualityScorer` is a pre-defined task that defines the `instruction` as the input\n and `score` as the output. This task is used to rate the quality of instructions and responses.\n It's an implementation of the quality score task from the paper 'What Makes Good Data\n for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.\n The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs\n are scored in terms of quality, obtaining a quality score for each instruction.\n\n Attributes:\n _template: a Jinja2 template used to format the input for the LLM.\n\n Input columns:\n - instruction (`str`): The instruction that was used to generate the `responses`.\n - responses (`List[str]`): The responses to be scored. Each response forms a pair with the instruction.\n\n Output columns:\n - scores (`List[float]`): The score for each instruction.\n - model_name (`str`): The model name used to generate the scores.\n\n Categories:\n - scorer\n - quality\n - response\n\n References:\n - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n Examples:\n Evaluate the quality of your instructions:\n\n ```python\n from distilabel.steps.tasks import QualityScorer\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n scorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n )\n\n scorer.load()\n\n result = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n )\n # result\n [\n {\n 'instructions': 'instruction',\n 'model_name': 'test',\n 'scores': [5, 3, 1],\n }\n ]\n ```\n\n Generate structured output with default schema:\n\n ```python\n from distilabel.steps.tasks import QualityScorer\n from distilabel.models import InferenceEndpointsLLM\n\n scorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n use_default_structured_output=True\n )\n\n scorer.load()\n\n result = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n )\n\n # result\n [{'instruction': 'instruction',\n 'responses': ['good response', 'weird response', 'bad response'],\n 'scores': [1, 2, 3],\n 'distilabel_metadata': {'raw_output_quality_scorer_0': '{ \"scores\": [1, 2, 3] }'},\n 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n \"\"\"\n\n _template: Union[Template, None] = PrivateAttr(...)\n _can_be_used_with_offline_batch_generation = True\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"quality-scorer.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the task are `instruction` and `responses`.\"\"\"\n return [\"instruction\", \"responses\"]\n\n def format_input(self, input: Dict[str, Any]) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], responses=input[\"responses\"]\n ),\n }\n ]\n\n @property\n def outputs(self):\n \"\"\"The output for the task is a list of `scores` containing the quality score for each\n response in `responses`.\"\"\"\n return [\"scores\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction-response pair.\n \"\"\"\n if output is None:\n return {\"scores\": [None] * len(input[\"responses\"])}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n scores = []\n score_lines = output.split(\"\\n\")\n\n for i, line in enumerate(score_lines):\n match = _PARSE_SCORE_LINE_REGEX.match(line)\n score = float(match.group(1)) if match else None\n scores.append(score)\n if i == len(input[\"responses\"]) - 1:\n break\n return {\"scores\": scores}\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaQualityScorer(BaseModel):\n scores: List[int]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"scores\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Scores\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"scores\"],\n \"title\": \"SchemaQualityScorer\",\n \"type\": \"object\",\n }\n\n def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n ) -> Dict[str, str]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with the scores, and a list with them.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n return {\"scores\": [None] * len(input[\"responses\"])}\n\n @override\n def _sample_input(self) -> ChatType:\n return self.format_input(\n {\n \"instruction\": f\"<PLACEHOLDER_{'instruction'.upper()}>\",\n \"responses\": [\n f\"<PLACEHOLDER_{f'RESPONSE_{i}'.upper()}>\" for i in range(2)\n ],\n }\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.inputs","title":"inputs: List[str] property ","text":"The inputs for the task are instruction and responses . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.outputs","title":"outputs property ","text":"The output for the task is a list of scores containing the quality score for each response in responses . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/quality_scorer.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"quality-scorer.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. 
Source code in src/distilabel/steps/tasks/quality_scorer.py def format_input(self, input: Dict[str, Any]) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], responses=input[\"responses\"]\n ),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.format_output","title":"format_output(output, input) ","text":"The output is formatted as a list with the score of each instruction-response pair. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the key scores containing the scores for each instruction-response pair. Source code in src/distilabel/steps/tasks/quality_scorer.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction-response pair.\n \"\"\"\n if output is None:\n return {\"scores\": [None] * len(input[\"responses\"])}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n scores = []\n score_lines = output.split(\"\\n\")\n\n for i, line in enumerate(score_lines):\n match = _PARSE_SCORE_LINE_REGEX.match(line)\n score = float(match.group(1)) if match else None\n scores.append(score)\n if i == len(input[\"responses\"]) - 1:\n break\n return {\"scores\": scores}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaQualityScorer(BaseModel):\n scores: List[int]\n Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/quality_scorer.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaQualityScorer(BaseModel):\n scores: List[int]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"scores\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Scores\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"scores\"],\n \"title\": \"SchemaQualityScorer\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer._format_structured_output","title":"_format_structured_output(output, input) ","text":"Parses the structured response, which should correspond to a dictionary with the scores, and a list with them. 
Parameters: Name Type Description Default output str The output from the LLM . required Returns: Type Description Dict[str, str] Formatted output. Source code in src/distilabel/steps/tasks/quality_scorer.py def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n) -> Dict[str, str]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with the scores, and a list with them.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n return {\"scores\": [None] * len(input[\"responses\"])}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct","title":"SelfInstruct ","text":" Bases: Task Generate instructions based on a given input using an LLM . SelfInstruct is a pre-defined task that, given a number of instructions, certain criteria for query generation, an application description, and an input, generates a number of instructions related to the given input, following what is stated in the criteria for query generation and the application description. It is based on the SelfInstruct framework from the paper \"Self-Instruct: Aligning Language Models with Self-Generated Instructions\". Attributes: Name Type Description num_instructions int The number of instructions to be generated. Defaults to 5. criteria_for_query_generation str The criteria for the query generation. Defaults to the criteria defined within the paper. application_description str The description of the AI application that one wants to build with these instructions. Defaults to AI assistant . Input columns - input (
str ): The input to generate the instructions. It's also called seed in the paper. Output columns - instructions (
List[str] ): The generated instructions. - model_name (
str ): The model name used to generate the instructions. Categories Reference Self-Instruct: Aligning Language Models with Self-Generated Instructions Examples: Generate instructions based on a given input: from distilabel.steps.tasks import SelfInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\nself_instruct = SelfInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=5, # This is the default value\n)\n\nself_instruct.load()\n\nresult = next(self_instruct.process([{\"input\": \"instruction\"}]))\n# result\n# [\n# {\n# 'input': 'instruction',\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# 'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n# }\n# ]\n Citations @misc{wang2023selfinstructaligninglanguagemodels,\n title={Self-Instruct: Aligning Language Models with Self-Generated Instructions},\n author={Yizhong Wang and Yeganeh Kordi and Swaroop Mishra and Alisa Liu and Noah A. Smith and Daniel Khashabi and Hannaneh Hajishirzi},\n year={2023},\n eprint={2212.10560},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2212.10560},\n}\n Source code in src/distilabel/steps/tasks/self_instruct.py class SelfInstruct(Task):\n \"\"\"Generate instructions based on a given input using an `LLM`.\n\n `SelfInstruct` is a pre-defined task that, given a number of instructions, a\n certain criteria for query generations, an application description, and an input,\n generates a number of instruction related to the given input and following what\n is stated in the criteria for query generation and the application description.\n It is based in the SelfInstruct framework from the paper \"Self-Instruct: Aligning\n Language Models with Self-Generated Instructions\".\n\n Attributes:\n num_instructions: The number of instructions to be generated. Defaults to 5.\n criteria_for_query_generation: The criteria for the query generation. Defaults\n to the criteria defined within the paper.\n application_description: The description of the AI application that one want\n to build with these instructions. Defaults to `AI assistant`.\n\n Input columns:\n - input (`str`): The input to generate the instructions. 
It's also called seed in\n the paper.\n\n Output columns:\n - instructions (`List[str]`): The generated instructions.\n - model_name (`str`): The model name used to generate the instructions.\n\n Categories:\n - text-generation\n\n Reference:\n - [`Self-Instruct: Aligning Language Models with Self-Generated Instructions`](https://arxiv.org/abs/2212.10560)\n\n Examples:\n Generate instructions based on a given input:\n\n ```python\n from distilabel.steps.tasks import SelfInstruct\n from distilabel.models import InferenceEndpointsLLM\n\n self_instruct = SelfInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=5, # This is the default value\n )\n\n self_instruct.load()\n\n result = next(self_instruct.process([{\"input\": \"instruction\"}]))\n # result\n # [\n # {\n # 'input': 'instruction',\n # 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n # 'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{wang2023selfinstructaligninglanguagemodels,\n title={Self-Instruct: Aligning Language Models with Self-Generated Instructions},\n author={Yizhong Wang and Yeganeh Kordi and Swaroop Mishra and Alisa Liu and Noah A. Smith and Daniel Khashabi and Hannaneh Hajishirzi},\n year={2023},\n eprint={2212.10560},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2212.10560},\n }\n ```\n \"\"\"\n\n num_instructions: int = 5\n criteria_for_query_generation: str = (\n \"Incorporate a diverse range of verbs, avoiding repetition.\\n\"\n \"Ensure queries are compatible with AI model's text generation functions and are limited to 1-2 sentences.\\n\"\n \"Design queries to be self-contained and standalone.\\n\"\n 'Blend interrogative (e.g., \"What is the significance of x?\") and imperative (e.g., \"Detail the process of x.\") styles.'\n )\n application_description: str = \"AI assistant\"\n\n _template: Union[Template, None] = PrivateAttr(...)\n _can_be_used_with_offline_batch_generation = True\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"self-instruct.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `input` i.e. seed text.\"\"\"\n return [\"input\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n input=input[\"input\"],\n application_description=self.application_description,\n criteria_for_query_generation=self.criteria_for_query_generation,\n num_instructions=self.num_instructions,\n ),\n }\n ]\n\n @property\n def outputs(self):\n \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n return [\"instructions\", \"model_name\"]\n\n def format_output(\n self,\n output: Union[str, None],\n input: Optional[Dict[str, Any]] = None,\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the generated instructions.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with containing the generated instructions.\n \"\"\"\n if output is None:\n return {\"instructions\": []}\n return {\"instructions\": [line for line in output.split(\"\\n\") if line != \"\"]}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.inputs","title":"inputs: List[str] property ","text":"The input for the task is the input i.e. seed text. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.outputs","title":"outputs property ","text":"The output for the task is a list of instructions containing the generated instructions. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/self_instruct.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"self-instruct.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/self_instruct.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n input=input[\"input\"],\n application_description=self.application_description,\n criteria_for_query_generation=self.criteria_for_query_generation,\n num_instructions=self.num_instructions,\n ),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.format_output","title":"format_output(output, input=None) ","text":"The output is formatted as a list with the generated instructions. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Optional[Dict[str, Any]] the input to the task. Used for obtaining the number of responses. None Returns: Type Description Dict[str, Any] A dict with containing the generated instructions. Source code in src/distilabel/steps/tasks/self_instruct.py def format_output(\n self,\n output: Union[str, None],\n input: Optional[Dict[str, Any]] = None,\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the generated instructions.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with containing the generated instructions.\n \"\"\"\n if output is None:\n return {\"instructions\": []}\n return {\"instructions\": [line for line in output.split(\"\\n\") if line != \"\"]}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair","title":"GenerateSentencePair ","text":" Bases: Task Generate a positive and negative (optionally) sentences given an anchor sentence. GenerateSentencePair is a pre-defined task that given an anchor sentence generates a positive sentence related to the anchor and optionally a negative sentence unrelated to the anchor or similar to it. Optionally, you can give a context to guide the LLM towards more specific behavior. 
This task is useful for generating training datasets for embedding models. Attributes: Name Type Description triplet bool a flag to indicate if the task should generate a triplet of sentences (anchor, positive, negative). Defaults to False . action GenerationAction the action to perform to generate the positive sentence. context str the context to use for the generation. Can be helpful to guide the LLM towards more specific context. Not used by default. hard_negative bool A flag to indicate if the negative should be a hard-negative or not. Hard negatives make it hard for the model to distinguish against the positive, with a higher degree of semantic similarity. Input columns - anchor (
str ): The anchor sentence to generate the positive and negative sentences. Output columns - positive (
str ): The positive sentence related to the anchor . - negative (
str ): The negative sentence unrelated to the anchor if triplet=True , or more similar to the positive to make it more challenging for a model to distinguish in case hard_negative=True . - model_name (
str ): The name of the model that was used to generate the sentences. Categories Examples: Paraphrasing: from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"paraphrase\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n Generating semantically similar sentences: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import GenerateSentencePair\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"semantically-similar\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n Generating queries: from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. 
Using Argilla, ...\"}])\n Generating answers: from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"answer\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n Generating queries with context (applies to every action): from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n Generating Hard-negatives (applies to every action): from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n hard_negative=True,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n Generating structured data with default schema (applies to every action): from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n hard_negative=True,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n use_default_structured_output=True\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n Source code in src/distilabel/steps/tasks/sentence_transformers.py class GenerateSentencePair(Task):\n \"\"\"Generate a positive and negative (optionally) sentences given an anchor sentence.\n\n `GenerateSentencePair` is a pre-defined task that given an anchor sentence generates\n a positive sentence related to the anchor and optionally a negative sentence unrelated\n to the anchor or similar to it. Optionally, you can give a context to guide the LLM\n towards more specific behavior. This task is useful to generate training datasets for\n training embeddings models.\n\n Attributes:\n triplet: a flag to indicate if the task should generate a triplet of sentences\n (anchor, positive, negative). Defaults to `False`.\n action: the action to perform to generate the positive sentence.\n context: the context to use for the generation. 
Can be helpful to guide the LLM\n towards more specific context. Not used by default.\n hard_negative: A flag to indicate if the negative should be a hard-negative or not.\n Hard negatives make it hard for the model to distinguish against the positive,\n with a higher degree of semantic similarity.\n\n Input columns:\n - anchor (`str`): The anchor sentence to generate the positive and negative sentences.\n\n Output columns:\n - positive (`str`): The positive sentence related to the `anchor`.\n - negative (`str`): The negative sentence unrelated to the `anchor` if `triplet=True`,\n or more similar to the positive to make it more challenging for a model to distinguish\n in case `hard_negative=True`.\n - model_name (`str`): The name of the model that was used to generate the sentences.\n\n Categories:\n - embedding\n\n Examples:\n Paraphrasing:\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"paraphrase\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n ```\n\n Generating semantically similar sentences:\n\n ```python\n from distilabel.models import InferenceEndpointsLLM\n from distilabel.steps.tasks import GenerateSentencePair\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"semantically-similar\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n ```\n\n Generating queries:\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. 
Using Argilla, ...\"}])\n ```\n\n Generating answers:\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"answer\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n ```\n\n Generating queries with context (**applies to every action**):\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n ```\n\n Generating Hard-negatives (**applies to every action**):\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n hard_negative=True,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n ```\n\n Generating structured data with default schema (**applies to every action**):\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n hard_negative=True,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n use_default_structured_output=True\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n ```\n \"\"\"\n\n triplet: bool = False\n action: GenerationAction\n hard_negative: bool = False\n context: str = \"\"\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"generate-sentence-pair.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the task is the `anchor` sentence.\"\"\"\n return [\"anchor\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The inputs are formatted as a `ChatType`, with a system prompt describing the\n task of generating a positive and negative sentences 
for the anchor sentence. The\n anchor is provided as the first user interaction in the conversation.\n\n Args:\n input: The input containing the `anchor` sentence.\n\n Returns:\n A list of dictionaries containing the system and user interactions.\n \"\"\"\n action_sentence = GENERATION_ACTION_SENTENCES[self.action]\n\n format_system_prompt = {\n \"action_sentence\": action_sentence,\n \"context\": CONTEXT_INTRO if self.context else \"\",\n }\n if self.triplet:\n format_system_prompt[\"negative_style\"] = NEGATIVE_STYLE[\n \"hard-negative\" if self.hard_negative else \"negative\"\n ]\n\n system_prompt = (\n POSITIVE_NEGATIVE_SYSTEM_PROMPT if self.triplet else POSITIVE_SYSTEM_PROMPT\n ).format(**format_system_prompt)\n\n return [\n {\"role\": \"system\", \"content\": system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n anchor=input[\"anchor\"],\n context=self.context if self.context else None,\n ),\n },\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The outputs for the task are the `positive` and `negative` sentences, as well\n as the `model_name` used to generate the sentences.\"\"\"\n columns = [\"positive\", \"negative\"] if self.triplet else [\"positive\"]\n columns += [\"model_name\"]\n return columns\n\n def format_output(\n self, output: Union[str, None], input: Optional[Dict[str, Any]] = None\n ) -> Dict[str, Any]:\n \"\"\"Formats the output of the LLM, to extract the `positive` and `negative` sentences\n generated. If the output is `None` or the regex doesn't match, then the outputs\n will be set to `None` as well.\n\n Args:\n output: The output of the LLM.\n input: The input used to generate the output.\n\n Returns:\n The formatted output containing the `positive` and `negative` sentences.\n \"\"\"\n if output is None:\n return {\"positive\": None, \"negative\": None}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output)\n\n match = POSITIVE_NEGATIVE_PAIR_REGEX.match(output)\n if match is None:\n formatted_output = {\"positive\": None}\n if self.triplet:\n formatted_output[\"negative\"] = None\n return formatted_output\n\n groups = match.groups()\n if self.triplet:\n return {\n \"positive\": groups[0].strip(),\n \"negative\": (\n groups[1].strip()\n if len(groups) > 1 and groups[1] is not None\n else None\n ),\n }\n\n return {\"positive\": groups[0].strip()}\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.triplet:\n return {\n \"properties\": {\n \"positive\": {\"title\": \"Positive\", \"type\": \"string\"},\n \"negative\": {\"title\": \"Negative\", \"type\": \"string\"},\n },\n \"required\": [\"positive\", \"negative\"],\n \"title\": \"Schema\",\n \"type\": \"object\",\n }\n return {\n \"properties\": {\"positive\": {\"title\": \"Positive\", \"type\": \"string\"}},\n \"required\": [\"positive\"],\n \"title\": \"Schema\",\n \"type\": \"object\",\n }\n\n def _format_structured_output(self, output: str) -> Dict[str, str]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.triplet:\n return 
{\"positive\": None, \"negative\": None}\n return {\"positive\": None}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.inputs","title":"inputs: List[str] property ","text":"The inputs for the task is the anchor sentence. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.outputs","title":"outputs: List[str] property ","text":"The outputs for the task are the positive and negative sentences, as well as the model_name used to generate the sentences. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/sentence_transformers.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"generate-sentence-pair.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.format_input","title":"format_input(input) ","text":"The inputs are formatted as a ChatType , with a system prompt describing the task of generating a positive and negative sentences for the anchor sentence. The anchor is provided as the first user interaction in the conversation. Parameters: Name Type Description Default input Dict[str, Any] The input containing the anchor sentence. required Returns: Type Description ChatType A list of dictionaries containing the system and user interactions. Source code in src/distilabel/steps/tasks/sentence_transformers.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The inputs are formatted as a `ChatType`, with a system prompt describing the\n task of generating a positive and negative sentences for the anchor sentence. The\n anchor is provided as the first user interaction in the conversation.\n\n Args:\n input: The input containing the `anchor` sentence.\n\n Returns:\n A list of dictionaries containing the system and user interactions.\n \"\"\"\n action_sentence = GENERATION_ACTION_SENTENCES[self.action]\n\n format_system_prompt = {\n \"action_sentence\": action_sentence,\n \"context\": CONTEXT_INTRO if self.context else \"\",\n }\n if self.triplet:\n format_system_prompt[\"negative_style\"] = NEGATIVE_STYLE[\n \"hard-negative\" if self.hard_negative else \"negative\"\n ]\n\n system_prompt = (\n POSITIVE_NEGATIVE_SYSTEM_PROMPT if self.triplet else POSITIVE_SYSTEM_PROMPT\n ).format(**format_system_prompt)\n\n return [\n {\"role\": \"system\", \"content\": system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n anchor=input[\"anchor\"],\n context=self.context if self.context else None,\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.format_output","title":"format_output(output, input=None) ","text":"Formats the output of the LLM, to extract the positive and negative sentences generated. If the output is None or the regex doesn't match, then the outputs will be set to None as well. Parameters: Name Type Description Default output Union[str, None] The output of the LLM. required input Optional[Dict[str, Any]] The input used to generate the output. None Returns: Type Description Dict[str, Any] The formatted output containing the positive and negative sentences. 
Source code in src/distilabel/steps/tasks/sentence_transformers.py def format_output(\n self, output: Union[str, None], input: Optional[Dict[str, Any]] = None\n) -> Dict[str, Any]:\n \"\"\"Formats the output of the LLM, to extract the `positive` and `negative` sentences\n generated. If the output is `None` or the regex doesn't match, then the outputs\n will be set to `None` as well.\n\n Args:\n output: The output of the LLM.\n input: The input used to generate the output.\n\n Returns:\n The formatted output containing the `positive` and `negative` sentences.\n \"\"\"\n if output is None:\n return {\"positive\": None, \"negative\": None}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output)\n\n match = POSITIVE_NEGATIVE_PAIR_REGEX.match(output)\n if match is None:\n formatted_output = {\"positive\": None}\n if self.triplet:\n formatted_output[\"negative\"] = None\n return formatted_output\n\n groups = match.groups()\n if self.triplet:\n return {\n \"positive\": groups[0].strip(),\n \"negative\": (\n groups[1].strip()\n if len(groups) > 1 and groups[1] is not None\n else None\n ),\n }\n\n return {\"positive\": groups[0].strip()}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/sentence_transformers.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.triplet:\n return {\n \"properties\": {\n \"positive\": {\"title\": \"Positive\", \"type\": \"string\"},\n \"negative\": {\"title\": \"Negative\", \"type\": \"string\"},\n },\n \"required\": [\"positive\", \"negative\"],\n \"title\": \"Schema\",\n \"type\": \"object\",\n }\n return {\n \"properties\": {\"positive\": {\"title\": \"Positive\", \"type\": \"string\"}},\n \"required\": [\"positive\"],\n \"title\": \"Schema\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair._format_structured_output","title":"_format_structured_output(output) ","text":"Parses the structured response, which should correspond to a dictionary with either positive , or positive and negative keys. Parameters: Name Type Description Default output str The output from the LLM . required Returns: Type Description Dict[str, str] Formatted output. 
Source code in src/distilabel/steps/tasks/sentence_transformers.py def _format_structured_output(self, output: str) -> Dict[str, str]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with either `positive`, or `positive` and `negative` keys.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        if self.triplet:\n            return {\"positive\": None, \"negative\": None}\n        return {\"positive\": None}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration","title":"StructuredGeneration ","text":" Bases: Task Generate structured content for a given instruction using an LLM . StructuredGeneration is a pre-defined task that defines the instruction and the structured_output as the inputs, and generation as the output. This task is used to generate structured content based on the input instruction and following the schema provided within the structured_output column per each instruction . The model_name is also returned as part of the output. Attributes: Name Type Description use_system_prompt bool Whether to use the system prompt in the generation. Defaults to False . If set to True and the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise it will be ignored. Input columns - instruction (
str ): The instruction to generate structured content from. - structured_output (
Dict[str, Any] ): The structured_output to generate structured content from. It should be a Python dictionary with the keys format and schema , where format should be one of json or regex , and the schema should be either the JSON schema or the regex pattern, respectively. Output columns - generation (
str ): The generated text matching the provided schema, if possible. - model_name (
str ): The name of the model used to generate the text. Categories - outlines
- structured-generation
Examples: Generate structured output from a JSON schema: from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n)\n\nstructured_gen.load()\n\nresult = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"Create an RPG character\",\n \"structured_output\": {\n \"format\": \"json\",\n \"schema\": {\n \"properties\": {\n \"name\": {\n \"title\": \"Name\",\n \"type\": \"string\"\n },\n \"description\": {\n \"title\": \"Description\",\n \"type\": \"string\"\n },\n \"role\": {\n \"title\": \"Role\",\n \"type\": \"string\"\n },\n \"weapon\": {\n \"title\": \"Weapon\",\n \"type\": \"string\"\n }\n },\n \"required\": [\n \"name\",\n \"description\",\n \"role\",\n \"weapon\"\n ],\n \"title\": \"Character\",\n \"type\": \"object\"\n }\n },\n }\n ]\n )\n)\n Generate structured output from a regex pattern (only works with LLMs that support regex, the providers using outlines): from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n)\n\nstructured_gen.load()\n\nresult = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n \"structured_output\": {\n \"format\": \"regex\",\n \"schema\": r\"(\\d{1,2})\u00b0C\"\n },\n\n }\n ]\n )\n)\n Source code in src/distilabel/steps/tasks/structured_generation.py class StructuredGeneration(Task):\n \"\"\"Generate structured content for a given `instruction` using an `LLM`.\n\n `StructuredGeneration` is a pre-defined task that defines the `instruction` and the `structured_output`\n as the inputs, and `generation` as the output. This task is used to generate structured content based on\n the input instruction and following the schema provided within the `structured_output` column per each\n `instruction`. The `model_name` also returned as part of the output in order to enhance it.\n\n Attributes:\n use_system_prompt: Whether to use the system prompt in the generation. Defaults to `True`,\n which means that if the column `system_prompt` is defined within the input batch, then\n the `system_prompt` will be used, otherwise, it will be ignored.\n\n Input columns:\n - instruction (`str`): The instruction to generate structured content from.\n - structured_output (`Dict[str, Any]`): The structured_output to generate structured content from. 
It should be a\n Python dictionary with the keys `format` and `schema`, where `format` should be one of `json` or\n `regex`, and the `schema` should be either the JSON schema or the regex pattern, respectively.\n\n Output columns:\n - generation (`str`): The generated text matching the provided schema, if possible.\n - model_name (`str`): The name of the model used to generate the text.\n\n Categories:\n - outlines\n - structured-generation\n\n Examples:\n Generate structured output from a JSON schema:\n\n ```python\n from distilabel.steps.tasks import StructuredGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n structured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n )\n\n structured_gen.load()\n\n result = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"Create an RPG character\",\n \"structured_output\": {\n \"format\": \"json\",\n \"schema\": {\n \"properties\": {\n \"name\": {\n \"title\": \"Name\",\n \"type\": \"string\"\n },\n \"description\": {\n \"title\": \"Description\",\n \"type\": \"string\"\n },\n \"role\": {\n \"title\": \"Role\",\n \"type\": \"string\"\n },\n \"weapon\": {\n \"title\": \"Weapon\",\n \"type\": \"string\"\n }\n },\n \"required\": [\n \"name\",\n \"description\",\n \"role\",\n \"weapon\"\n ],\n \"title\": \"Character\",\n \"type\": \"object\"\n }\n },\n }\n ]\n )\n )\n ```\n\n Generate structured output from a regex pattern (only works with LLMs that support regex, the providers using outlines):\n\n ```python\n from distilabel.steps.tasks import StructuredGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n structured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n )\n\n structured_gen.load()\n\n result = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n \"structured_output\": {\n \"format\": \"regex\",\n \"schema\": r\"(\\\\d{1,2})\u00b0C\"\n },\n\n }\n ]\n )\n )\n ```\n \"\"\"\n\n use_system_prompt: bool = False\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task are the `instruction` and the `structured_output`.\n Optionally, if the `use_system_prompt` flag is set to True, then the\n `system_prompt` will be used too.\"\"\"\n columns = [\"instruction\", \"structured_output\"]\n if self.use_system_prompt:\n columns = [\"system_prompt\"] + columns\n return columns\n\n def format_input(self, input: Dict[str, Any]) -> StructuredInput:\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n if not isinstance(input[\"instruction\"], str):\n raise DistilabelUserError(\n f\"Input `instruction` must be a string. 
Got: {input['instruction']}.\",\n page=\"components-gallery/tasks/structuredgeneration/\",\n )\n\n messages = [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n if self.use_system_prompt:\n if \"system_prompt\" in input:\n messages.insert(\n 0, {\"role\": \"system\", \"content\": input[\"system_prompt\"]}\n )\n else:\n warnings.warn(\n \"`use_system_prompt` is set to `True`, but no `system_prompt` in input batch, so it will be ignored.\",\n UserWarning,\n stacklevel=2,\n )\n\n return (messages, input.get(\"structured_output\", None)) # type: ignore\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n return [\"generation\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`. Note that even\n if the `structured_output` is defined to produce a JSON schema, this method will return the raw\n output i.e. a string without any parsing.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.inputs","title":"inputs: List[str] property ","text":"The input for the task are the instruction and the structured_output . Optionally, if the use_system_prompt flag is set to True, then the system_prompt will be used too. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.outputs","title":"outputs: List[str] property ","text":"The output for the task is the generation and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/structured_generation.py def format_input(self, input: Dict[str, Any]) -> StructuredInput:\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n if not isinstance(input[\"instruction\"], str):\n raise DistilabelUserError(\n f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n page=\"components-gallery/tasks/structuredgeneration/\",\n )\n\n messages = [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n if self.use_system_prompt:\n if \"system_prompt\" in input:\n messages.insert(\n 0, {\"role\": \"system\", \"content\": input[\"system_prompt\"]}\n )\n else:\n warnings.warn(\n \"`use_system_prompt` is set to `True`, but no `system_prompt` in input batch, so it will be ignored.\",\n UserWarning,\n stacklevel=2,\n )\n\n return (messages, input.get(\"structured_output\", None)) # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.format_output","title":"format_output(output, input) ","text":"The output is formatted as a dictionary with the generation . The model_name will be automatically included within the process method of Task . Note that even if the structured_output is defined to produce a JSON schema, this method will return the raw output i.e. a string without any parsing. 
Source code in src/distilabel/steps/tasks/structured_generation.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`. Note that even\n if the `structured_output` is defined to produce a JSON schema, this method will return the raw\n output i.e. a string without any parsing.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification","title":"TextClassification ","text":" Bases: Task Classifies text into one or more categories or labels. This task can be used for text classification problems, where the goal is to assign one or multiple labels to a given text. It uses structured generation as per the reference paper by default, which can help to generate more concise labels. See section 4.1 in the reference. Input columns - text (
str ): The reference text we want to obtain labels for. Output columns - labels (
Union[str, List[str]] ): The label or list of labels for the text. - model_name (
str ): The name of the model used to generate the label/s. Categories References Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models Attributes: Name Type Description system_prompt Optional[str] A prompt to display to the user before the task starts. Contains a default message to make the model behave like a classifier specialist. n PositiveInt Number of labels to generate If only 1 is required, corresponds to a label classification problem, if >1 it will intend return the \"n\" labels most representative for the text. Defaults to 1. context Optional[str] Context to use when generating the labels. By default contains a generic message, but can be used to customize the context for the task. examples Optional[List[str]] List of examples to help the model understand the task, few shots. available_labels Optional[Union[List[str], Dict[str, str]]] List of available labels to choose from when classifying the text, or a dictionary with the labels and their descriptions. default_label Optional[Union[str, List[str]]] Default label to use when the text is ambiguous or lacks sufficient information for classification. Can be a list in case of multiple labels (n>1). Examples: Assigning a sentiment to a text: from distilabel.steps.tasks import TextClassification\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\ntext_classification = TextClassification(\n llm=llm,\n context=\"You are an AI system specialized in assigning sentiment to movies.\",\n available_labels=[\"positive\", \"negative\"],\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n )\n)\n# result\n# [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n# 'labels': 'positive',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"positive\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"positive\", # The text shows positive sentiment\\n \"negative\", # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. 
Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Assigning predefined labels with specified descriptions: from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n llm=llm,\n n=1,\n context=\"Determine the intent of the text.\",\n available_labels={\n \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n \"inquiry\": \"A question or request for information about a product, service, or situation. It's a neutral or curious expression seeking clarification or details.\",\n \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n },\n query_title=\"Customer Query\",\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"Can you tell me more about your return policy?\"}]\n )\n)\n# result\n# [{'text': 'Can you tell me more about your return policy?',\n# 'labels': 'inquiry',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"inquiry\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"complaint\", # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n \"inquiry\", # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n \"feedback\", # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n \"praise\", # A statement expressing admiration, approval, or appreciation for a product, service, or experience. 
It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Free multi label classification without predefined labels: from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n llm=llm,\n n=3,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following type of persona.\"\n ),\n query_title=\"Example of Persona\",\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\"}]\n )\n)\n# result\n# [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n# 'labels': ['Historical Researcher',\n# 'Cultural Specialist',\n# 'Ethnic Studies Expert'],\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Source code in src/distilabel/steps/tasks/text_classification.py class TextClassification(Task):\n r\"\"\"Classifies text into one or more categories or labels.\n\n This task can be used for text classification problems, where the goal is to assign\n one or multiple labels to a given text.\n It uses structured generation as per the reference paper by default,\n it can help to generate more concise labels. 
See section 4.1 in the reference.\n\n Input columns:\n - text (`str`): The reference text we want to obtain labels for.\n\n Output columns:\n - labels (`Union[str, List[str]]`): The label or list of labels for the text.\n - model_name (`str`): The name of the model used to generate the label/s.\n\n Categories:\n - text-classification\n\n References:\n - [`Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models`](https://arxiv.org/abs/2408.02442)\n\n Attributes:\n system_prompt: A prompt to display to the user before the task starts. Contains a default\n message to make the model behave like a classifier specialist.\n n: Number of labels to generate If only 1 is required, corresponds to a label\n classification problem, if >1 it will intend return the \"n\" labels most representative\n for the text. Defaults to 1.\n context: Context to use when generating the labels. By default contains a generic message,\n but can be used to customize the context for the task.\n examples: List of examples to help the model understand the task, few shots.\n available_labels: List of available labels to choose from when classifying the text, or\n a dictionary with the labels and their descriptions.\n default_label: Default label to use when the text is ambiguous or lacks sufficient information for\n classification. Can be a list in case of multiple labels (n>1).\n\n Examples:\n Assigning a sentiment to a text:\n\n ```python\n from distilabel.steps.tasks import TextClassification\n from distilabel.models import InferenceEndpointsLLM\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n )\n\n text_classification = TextClassification(\n llm=llm,\n context=\"You are an AI system specialized in assigning sentiment to movies.\",\n available_labels=[\"positive\", \"negative\"],\n )\n\n text_classification.load()\n\n result = next(\n text_classification.process(\n [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n )\n )\n # result\n # [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n # 'labels': 'positive',\n # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"positive\"\\n}',\n # 'raw_input_text_classification_0': [{'role': 'system',\n # 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n # {'role': 'user',\n # 'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"positive\", # The text shows positive sentiment\\n \"negative\", # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. 
Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Assigning predefined labels with specified descriptions:\n\n ```python\n from distilabel.steps.tasks import TextClassification\n\n text_classification = TextClassification(\n llm=llm,\n n=1,\n context=\"Determine the intent of the text.\",\n available_labels={\n \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n \"inquiry\": \"A question or request for information about a product, service, or situation. It's a neutral or curious expression seeking clarification or details.\",\n \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n },\n query_title=\"Customer Query\",\n )\n\n text_classification.load()\n\n result = next(\n text_classification.process(\n [{\"text\": \"Can you tell me more about your return policy?\"}]\n )\n )\n # result\n # [{'text': 'Can you tell me more about your return policy?',\n # 'labels': 'inquiry',\n # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"inquiry\"\\n}',\n # 'raw_input_text_classification_0': [{'role': 'system',\n # 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n # {'role': 'user',\n # 'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"complaint\", # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n \"inquiry\", # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n \"feedback\", # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n \"praise\", # A statement expressing admiration, approval, or appreciation for a product, service, or experience. 
It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Free multi label classification without predefined labels:\n\n ```python\n from distilabel.steps.tasks import TextClassification\n\n text_classification = TextClassification(\n llm=llm,\n n=3,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following type of persona.\"\n ),\n query_title=\"Example of Persona\",\n )\n\n text_classification.load()\n\n result = next(\n text_classification.process(\n [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\"}]\n )\n )\n # result\n # [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n # 'labels': ['Historical Researcher',\n # 'Cultural Specialist',\n # 'Ethnic Studies Expert'],\n # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n # 'raw_input_text_classification_0': [{'role': 'system',\n # 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n # {'role': 'user',\n # 'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n \"\"\"\n\n system_prompt: Optional[str] = (\n \"You are an AI system specialized in generating labels to classify pieces of text. \"\n \"Your sole purpose is to analyze the given text and provide appropriate classification labels.\"\n )\n n: PositiveInt = Field(\n default=1,\n description=\"Number of labels to generate. 
Defaults to 1.\",\n )\n context: Optional[str] = Field(\n default=\"Generate concise, relevant labels that accurately represent the text's main themes, topics, or categories.\",\n description=\"Context to use when generating the labels.\",\n )\n examples: Optional[List[str]] = Field(\n default=None,\n description=\"List of examples to help the model understand the task, few shots.\",\n )\n available_labels: Optional[Union[List[str], Dict[str, str]]] = Field(\n default=None,\n description=(\n \"List of available labels to choose from when classifying the text, or \"\n \"a dictionary with the labels and their descriptions.\"\n ),\n )\n default_label: Optional[Union[str, List[str]]] = Field(\n default=\"Unclassified\",\n description=(\n \"Default label to use when the text is ambiguous or lacks sufficient information for \"\n \"classification. Can be a list in case of multiple labels (n>1).\"\n ),\n )\n query_title: str = Field(\n default=\"User Query\",\n description=\"Title of the query used to show the example/s to classify.\",\n )\n use_default_structured_output: bool = True\n\n _template: Optional[Template] = PrivateAttr(default=None)\n\n def load(self) -> None:\n super().load()\n self._template = Template(TEXT_CLASSIFICATION_TEMPLATE)\n self._labels_format: str = (\n '\"label\"'\n if self.n == 1\n else \"[\" + \", \".join([f'\"label_{i}\"' for i in range(self.n)]) + \"]\"\n )\n self._labels_message: str = (\n \"Provide the label that best describes the text.\"\n if self.n == 1\n else f\"Provide a list of {self.n} labels that best describe the text.\"\n )\n self._available_labels_message: str = self._get_available_labels_message()\n self._examples: str = self._get_examples_message()\n\n def _get_available_labels_message(self) -> str:\n \"\"\"Prepares the message to display depending on the available labels (if any),\n and whether the labels have a specific context.\n \"\"\"\n if self.available_labels is None:\n return (\n \"Use clear, widely understood terms for labels.\"\n \"Avoid overly specific or obscure labels unless the text demands it.\"\n )\n\n msg = (\n \"## Labeling the user input\\n\"\n \"Use the available labels to classify the user query{label_context}:\\n\"\n \"available_labels = {available_labels}\"\n )\n if isinstance(self.available_labels, list):\n specific_msg = (\n \"[\\n\"\n + indent(\n \"\".join([f'\"{label}\",\\n' for label in self.available_labels]),\n prefix=\" \" * 4,\n )\n + \"]\"\n )\n return msg.format(label_context=\"\", available_labels=specific_msg)\n\n elif isinstance(self.available_labels, dict):\n specific_msg = \"\"\n for label, description in self.available_labels.items():\n specific_msg += indent(\n f'\"{label}\", # {description}' + \"\\n\", prefix=\" \" * 4\n )\n\n specific_msg = \"[\\n\" + specific_msg + \"]\"\n return msg.format(\n label_context=\". 
Analyze the context of each label specifically\",\n available_labels=specific_msg,\n )\n\n def _get_examples_message(self) -> str:\n \"\"\"Prepares the message to display depending on the examples provided.\"\"\"\n if self.examples is None:\n return \"\"\n\n examples_msg = \"\\n\".join([f\"- {ex}\" for ex in self.examples])\n\n return (\n \"\\n## Examples\\n\"\n \"Here are some examples to help you understand the task:\\n\"\n f\"{examples_msg}\"\n )\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`.\"\"\"\n return [\"text\"]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n return [\"labels\", \"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n messages = [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n context=f\"\\n{self.context}\",\n labels_message=self._labels_message,\n available_labels=self._available_labels_message,\n examples=self._examples,\n default_label=self.default_label,\n labels_format=self._labels_format,\n query_title=self.query_title,\n text=input[\"text\"],\n ),\n },\n ]\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n return messages\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return self._format_structured_output(output)\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.n > 1:\n\n class MultiLabelSchema(BaseModel):\n labels: List[str]\n\n return MultiLabelSchema.model_json_schema()\n\n class SingleLabelSchema(BaseModel):\n labels: str\n\n return SingleLabelSchema.model_json_schema()\n\n def _format_structured_output(\n self, output: str\n ) -> Dict[str, Union[str, List[str]]]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with the `labels`, and either a string or a list of strings with the labels.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.n > 1:\n return {\"labels\": [None for _ in range(self.n)]}\n return {\"labels\": None}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.inputs","title":"inputs: List[str] property ","text":"The input for the task is the instruction . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.outputs","title":"outputs: List[str] property ","text":"The output for the task is the generation and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._get_available_labels_message","title":"_get_available_labels_message() ","text":"Prepares the message to display depending on the available labels (if any), and whether the labels have a specific context. 
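For a quick sense of what this helper builds, here is a small standalone sketch (not part of the library; the labels and descriptions are invented for the illustration) that mirrors the dict branch shown in the source below:

```python
from textwrap import indent

# Hypothetical labels with descriptions, standing in for `available_labels`.
available_labels = {
    "positive": "The text shows positive sentiment",
    "negative": "The text shows negative sentiment",
}

# One '"label", # description' entry per label, indented four spaces and
# wrapped in square brackets, similar to how the helper formats it.
entries = ""
for label, description in available_labels.items():
    entries += indent(f'"{label}", # {description}' + "\n", prefix=" " * 4)

message = (
    "## Labeling the user input\n"
    "Use the available labels to classify the user query. Analyze the context of each label specifically:\n"
    "available_labels = [\n" + entries + "]"
)
print(message)
```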
Source code in src/distilabel/steps/tasks/text_classification.py def _get_available_labels_message(self) -> str:\n \"\"\"Prepares the message to display depending on the available labels (if any),\n and whether the labels have a specific context.\n \"\"\"\n if self.available_labels is None:\n return (\n \"Use clear, widely understood terms for labels.\"\n \"Avoid overly specific or obscure labels unless the text demands it.\"\n )\n\n msg = (\n \"## Labeling the user input\\n\"\n \"Use the available labels to classify the user query{label_context}:\\n\"\n \"available_labels = {available_labels}\"\n )\n if isinstance(self.available_labels, list):\n specific_msg = (\n \"[\\n\"\n + indent(\n \"\".join([f'\"{label}\",\\n' for label in self.available_labels]),\n prefix=\" \" * 4,\n )\n + \"]\"\n )\n return msg.format(label_context=\"\", available_labels=specific_msg)\n\n elif isinstance(self.available_labels, dict):\n specific_msg = \"\"\n for label, description in self.available_labels.items():\n specific_msg += indent(\n f'\"{label}\", # {description}' + \"\\n\", prefix=\" \" * 4\n )\n\n specific_msg = \"[\\n\" + specific_msg + \"]\"\n return msg.format(\n label_context=\". Analyze the context of each label specifically\",\n available_labels=specific_msg,\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._get_examples_message","title":"_get_examples_message() ","text":"Prepares the message to display depending on the examples provided. Source code in src/distilabel/steps/tasks/text_classification.py def _get_examples_message(self) -> str:\n \"\"\"Prepares the message to display depending on the examples provided.\"\"\"\n if self.examples is None:\n return \"\"\n\n examples_msg = \"\\n\".join([f\"- {ex}\" for ex in self.examples])\n\n return (\n \"\\n## Examples\\n\"\n \"Here are some examples to help you understand the task:\\n\"\n f\"{examples_msg}\"\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/text_classification.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n messages = [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n context=f\"\\n{self.context}\",\n labels_message=self._labels_message,\n available_labels=self._available_labels_message,\n examples=self._examples,\n default_label=self.default_label,\n labels_format=self._labels_format,\n query_title=self.query_title,\n text=input[\"text\"],\n ),\n },\n ]\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n return messages\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.format_output","title":"format_output(output, input=None) ","text":"The output is formatted as a dictionary with the generation . The model_name will be automatically included within the process method of Task . Source code in src/distilabel/steps/tasks/text_classification.py def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. 
The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return self._format_structured_output(output)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/text_classification.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.n > 1:\n\n class MultiLabelSchema(BaseModel):\n labels: List[str]\n\n return MultiLabelSchema.model_json_schema()\n\n class SingleLabelSchema(BaseModel):\n labels: str\n\n return SingleLabelSchema.model_json_schema()\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._format_structured_output","title":"_format_structured_output(output) ","text":"Parses the structured response, which should correspond to a dictionary with the labels , and either a string or a list of strings with the labels. Parameters: Name Type Description Default output str The output from the LLM . required Returns: Type Description Dict[str, Union[str, List[str]]] Formatted output. Source code in src/distilabel/steps/tasks/text_classification.py def _format_structured_output(\n self, output: str\n) -> Dict[str, Union[str, List[str]]]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with the `labels`, and either a string or a list of strings with the labels.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.n > 1:\n return {\"labels\": [None for _ in range(self.n)]}\n return {\"labels\": None}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration","title":"ChatGeneration ","text":" Bases: Task Generates text based on a conversation. ChatGeneration is a pre-defined task that defines the messages as the input and generation as the output. This task is used to generate text based on a conversation. The model_name is also returned as part of the output in order to enhance it. Input columns - messages (
List[Dict[Literal[\"role\", \"content\"], str]] ): The messages to generate the follow up completion from. Output columns - generation (
str ): The generated text from the assistant. - model_name (
str ): The model name used to generate the text. Categories Icon :material-chat: Examples: Generate text from a conversation in OpenAI chat format: from distilabel.steps.tasks import ChatGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nchat = ChatGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nchat.load()\n\nresult = next(\n chat.process(\n [\n {\n \"messages\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n ]\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# 'generation': '4',\n# }\n# ]\n Source code in src/distilabel/steps/tasks/text_generation.py class ChatGeneration(Task):\n \"\"\"Generates text based on a conversation.\n\n `ChatGeneration` is a pre-defined task that defines the `messages` as the input\n and `generation` as the output. This task is used to generate text based on a conversation.\n The `model_name` is also returned as part of the output in order to enhance it.\n\n Input columns:\n - messages (`List[Dict[Literal[\"role\", \"content\"], str]]`): The messages to generate the\n follow up completion from.\n\n Output columns:\n - generation (`str`): The generated text from the assistant.\n - model_name (`str`): The model name used to generate the text.\n\n Categories:\n - chat-generation\n\n Icon:\n `:material-chat:`\n\n Examples:\n Generate text from a conversation in OpenAI chat format:\n\n ```python\n from distilabel.steps.tasks import ChatGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n chat = ChatGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n )\n\n chat.load()\n\n result = next(\n chat.process(\n [\n {\n \"messages\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n ]\n }\n ]\n )\n )\n # result\n # [\n # {\n # 'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n # 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n # 'generation': '4',\n # }\n # ]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task are the `messages`.\"\"\"\n return [\"messages\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the messages provided\n are already formatted that way i.e. following the OpenAI chat format.\"\"\"\n\n if not is_openai_format(input[\"messages\"]):\n raise DistilabelUserError(\n \"Input `messages` must be an OpenAI chat-like format conversation. \"\n f\"Got: {input['messages']}. Please check: 'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n page=\"components-gallery/tasks/chatgeneration/\",\n )\n\n if input[\"messages\"][-1][\"role\"] != \"user\":\n raise DistilabelUserError(\n \"The last message must be from the user. 
Please check: \"\n \"'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n page=\"components-gallery/tasks/chatgeneration/\",\n )\n\n return input[\"messages\"]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n return [\"generation\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.inputs","title":"inputs: List[str] property ","text":"The input for the task are the messages . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.outputs","title":"outputs: List[str] property ","text":"The output for the task is the generation and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the messages provided are already formatted that way i.e. following the OpenAI chat format. Source code in src/distilabel/steps/tasks/text_generation.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the messages provided\n are already formatted that way i.e. following the OpenAI chat format.\"\"\"\n\n if not is_openai_format(input[\"messages\"]):\n raise DistilabelUserError(\n \"Input `messages` must be an OpenAI chat-like format conversation. \"\n f\"Got: {input['messages']}. Please check: 'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n page=\"components-gallery/tasks/chatgeneration/\",\n )\n\n if input[\"messages\"][-1][\"role\"] != \"user\":\n raise DistilabelUserError(\n \"The last message must be from the user. Please check: \"\n \"'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n page=\"components-gallery/tasks/chatgeneration/\",\n )\n\n return input[\"messages\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.format_output","title":"format_output(output, input=None) ","text":"The output is formatted as a dictionary with the generation . The model_name will be automatically included within the process method of Task . Source code in src/distilabel/steps/tasks/text_generation.py def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration","title":"TextGeneration ","text":" Bases: Task Text generation with an LLM given a prompt. TextGeneration is a pre-defined task that allows passing a custom prompt using the Jinja2 syntax. By default, a instruction is expected in the inputs, but the using template and columns attributes one can define a custom prompt and columns expected from the text. This task should be good enough for tasks that don't need post-processing of the responses generated by the LLM. 
Attributes: Name Type Description system_prompt Union[str, None] The system prompt to use in the generation. If not provided, then it will check if the input row has a column named system_prompt and use it. If not, then no system prompt will be used. Defaults to None . template str The template to use for the generation. It must follow the Jinja2 template syntax. If not provided, it will assume the text passed is an instruction and construct the appropriate template. columns Union[str, List[str]] A string with the column, or a list with columns expected in the template. Take a look at the examples for more information. Defaults to instruction . use_system_prompt bool DEPRECATED. To be removed in 1.5.0. Whether to use the system prompt in the generation. Defaults to True , which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored. Input columns - dynamic (determined by
columns attribute): By default will be set to instruction . The columns can point to either a str or a List[str] to be used in the template. Output columns - generation (
str ): The generated text. - model_name (
str ): The name of the model used to generate the text. Categories References - Jinja2 Template Designer Documentation
Examples: Generate text from an instruction: from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n )\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [{\"instruction\": \"your instruction\"}]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'your instruction',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'generation',\n# }\n# ]\n Use a custom template to generate text: from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Document:\n{{ document }}\n\nQuestion: {{ question }}\n\nPlease provide a clear and concise answer to the question based on the information in the document and your general knowledge:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n template=CUSTOM_TEMPLATE,\n columns=[\"document\", \"question\"],\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [\n {\n \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n# 'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n# }\n# ]\n Few shot learning with different system prompts: from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n{% for example in examples %}\nExample {{ loop.index }}:\nInstruction: {{ example }}\n\n{% endfor %}\nNow, generate a new instruction in a similar style:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n template=CUSTOM_TEMPLATE,\n columns=\"examples\",\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [\n {\n \"examples\": [\"This is an example\", \"Another relevant example\"],\n \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'examples': ['This is an example', 'Another relevant example'],\n# 'system_prompt': 'You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'Disable the firewall on the router',\n# }\n# ]\n Source code in src/distilabel/steps/tasks/text_generation.py class TextGeneration(Task):\n \"\"\"Text generation with an `LLM` given a prompt.\n\n `TextGeneration` is a pre-defined task that allows passing a custom prompt using the\n Jinja2 syntax. By default, a `instruction` is expected in the inputs, but the using\n `template` and `columns` attributes one can define a custom prompt and columns expected\n from the text. This task should be good enough for tasks that don't need post-processing\n of the responses generated by the LLM.\n\n Attributes:\n system_prompt: The system prompt to use in the generation. If not provided, then\n it will check if the input row has a column named `system_prompt` and use it.\n If not, then no system prompt will be used. Defaults to `None`.\n template: The template to use for the generation. It must follow the Jinja2 template\n syntax. If not provided, it will assume the text passed is an instruction and\n construct the appropriate template.\n columns: A string with the column, or a list with columns expected in the template.\n Take a look at the examples for more information. Defaults to `instruction`.\n use_system_prompt: DEPRECATED. To be removed in 1.5.0. Whether to use the system\n prompt in the generation. 
Defaults to `True`, which means that if the column\n `system_prompt` is defined within the input batch, then the `system_prompt`\n will be used, otherwise, it will be ignored.\n\n Input columns:\n - dynamic (determined by `columns` attribute): By default will be set to `instruction`.\n The columns can point both to a `str` or a `List[str]` to be used in the template.\n\n Output columns:\n - generation (`str`): The generated text.\n - model_name (`str`): The name of the model used to generate the text.\n\n Categories:\n - text-generation\n\n References:\n - [Jinja2 Template Designer Documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/)\n\n Examples:\n Generate text from an instruction:\n\n ```python\n from distilabel.steps.tasks import TextGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n text_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n )\n )\n\n text_gen.load()\n\n result = next(\n text_gen.process(\n [{\"instruction\": \"your instruction\"}]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'your instruction',\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n # 'generation': 'generation',\n # }\n # ]\n ```\n\n Use a custom template to generate text:\n\n ```python\n from distilabel.steps.tasks import TextGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n CUSTOM_TEMPLATE = '''Document:\n {{ document }}\n\n Question: {{ question }}\n\n Please provide a clear and concise answer to the question based on the information in the document and your general knowledge:\n '''.rstrip()\n\n text_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n template=CUSTOM_TEMPLATE,\n columns=[\"document\", \"question\"],\n )\n\n text_gen.load()\n\n result = next(\n text_gen.process(\n [\n {\n \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n }\n ]\n )\n )\n # result\n # [\n # {\n # 'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n # 'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n # 'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n # }\n # ]\n ```\n\n Few shot learning with different system prompts:\n\n ```python\n from distilabel.steps.tasks import TextGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n CUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n {% for example in examples %}\n Example {{ loop.index }}:\n Instruction: {{ example }}\n\n {% endfor %}\n Now, generate a new instruction in a similar style:\n '''.rstrip()\n\n text_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n template=CUSTOM_TEMPLATE,\n columns=\"examples\",\n )\n\n text_gen.load()\n\n result = next(\n text_gen.process(\n [\n {\n \"examples\": [\"This is an example\", \"Another relevant example\"],\n \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n }\n ]\n )\n )\n # result\n # [\n # {\n # 'examples': ['This is an example', 'Another relevant example'],\n # 'system_prompt': 'You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n # 'generation': 'Disable the firewall on the router',\n # }\n # ]\n ```\n \"\"\"\n\n system_prompt: Union[str, None] = None\n use_system_prompt: bool = Field(default=True, deprecated=True)\n template: str = Field(\n default=\"{{ instruction }}\",\n description=(\n \"This is a template or prompt to use for the generation. \"\n \"If not provided, it is assumed a `instruction` is placed in the inputs, \"\n \"to be used as is.\"\n ),\n )\n columns: Union[str, List[str]] = Field(\n default=\"instruction\",\n description=(\n \"Custom column or list of columns to include in the input. \"\n \"If a `template` is provided which needs custom column names, \"\n \"then they should be provided here. 
By default it will use `instruction`.\"\n ),\n )\n\n _can_be_used_with_offline_batch_generation = True\n _template: Optional[\"Template\"] = PrivateAttr(default=...)\n\n def model_post_init(self, __context: Any) -> None:\n self.columns = [self.columns] if isinstance(self.columns, str) else self.columns\n super().model_post_init(__context)\n\n def load(self) -> None:\n super().load()\n\n for column in self.columns:\n check_column_in_template(column, self.template)\n\n self._template = Template(self.template)\n\n def unload(self) -> None:\n super().unload()\n self._template = None\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The input for the task is the `instruction` by default, or the `columns` given as input.\"\"\"\n columns = {column: True for column in self.columns}\n columns[\"system_prompt\"] = False\n return columns\n\n def _prepare_message_content(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n fields = {column: input[column] for column in self.columns}\n return [{\"role\": \"user\", \"content\": self._template.render(**fields)}]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n # Handle the previous expected errors, in case of custom columns there's more freedom\n # and we cannot check it so easily.\n if self.columns == [\"instruction\"]:\n if is_openai_format(input[\"instruction\"]):\n raise DistilabelUserError(\n \"Providing `instruction` formatted as an OpenAI chat / conversation is\"\n \" deprecated, you should use `ChatGeneration` with `messages` as input instead.\",\n page=\"components-gallery/tasks/textgeneration/\",\n )\n\n if not isinstance(input[\"instruction\"], str):\n raise DistilabelUserError(\n f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n page=\"components-gallery/tasks/textgeneration/\",\n )\n\n messages = self._prepare_message_content(input)\n\n row_system_prompt = input.get(\"system_prompt\")\n if row_system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": row_system_prompt})\n\n if self.system_prompt and not row_system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n return messages # type: ignore\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n return [\"generation\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.inputs","title":"inputs: StepColumns property ","text":"The input for the task is the instruction by default, or the columns given as input. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.outputs","title":"outputs: List[str] property ","text":"The output for the task is the generation and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration._prepare_message_content","title":"_prepare_message_content(input) ","text":"Prepares the content for the template and returns the formatted messages. 
Source code in src/distilabel/steps/tasks/text_generation.py def _prepare_message_content(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n fields = {column: input[column] for column in self.columns}\n return [{\"role\": \"user\", \"content\": self._template.render(**fields)}]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/text_generation.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n # Handle the previous expected errors, in case of custom columns there's more freedom\n # and we cannot check it so easily.\n if self.columns == [\"instruction\"]:\n if is_openai_format(input[\"instruction\"]):\n raise DistilabelUserError(\n \"Providing `instruction` formatted as an OpenAI chat / conversation is\"\n \" deprecated, you should use `ChatGeneration` with `messages` as input instead.\",\n page=\"components-gallery/tasks/textgeneration/\",\n )\n\n if not isinstance(input[\"instruction\"], str):\n raise DistilabelUserError(\n f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n page=\"components-gallery/tasks/textgeneration/\",\n )\n\n messages = self._prepare_message_content(input)\n\n row_system_prompt = input.get(\"system_prompt\")\n if row_system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": row_system_prompt})\n\n if self.system_prompt and not row_system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n return messages # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.format_output","title":"format_output(output, input=None) ","text":"The output is formatted as a dictionary with the generation . The model_name will be automatically included within the process method of Task . Source code in src/distilabel/steps/tasks/text_generation.py def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGenerationWithImage","title":"TextGenerationWithImage ","text":" Bases: TextGeneration Text generation with images with an LLM given a prompt. `TextGenerationWithImage` is a pre-defined task that allows passing a custom prompt using the\nJinja2 syntax. By default, a `instruction` is expected in the inputs, but the using\n`template` and `columns` attributes one can define a custom prompt and columns expected\nfrom the text. Additionally, an `image` column is expected containing one of the\nurl, base64 encoded image or PIL image. This task inherits from `TextGeneration`,\nso all the functionality available in that task related to the prompt will be available\nhere too.\n\nAttributes:\n system_prompt: The system prompt to use in the generation.\n If not, then no system prompt will be used. Defaults to `None`.\n template: The template to use for the generation. 
It must follow the Jinja2 template\n syntax. If not provided, it will assume the text passed is an instruction and\n construct the appropriate template.\n columns: A string with the column, or a list with columns expected in the template.\n Take a look at the examples for more information. Defaults to `instruction`.\n image_type: The type of the image provided, this will be used to preprocess if necessary.\n Must be one of \"url\", \"base64\" or \"PIL\".\n\nInput columns:\n - dynamic (determined by `columns` attribute): By default will be set to `instruction`.\n The columns can point both to a `str` or a `list[str]` to be used in the template.\n - image: The column containing the image URL, base64 encoded image or PIL image.\n\nOutput columns:\n - generation (`str`): The generated text.\n - model_name (`str`): The name of the model used to generate the text.\n\nCategories:\n - text-generation\n\nReferences:\n - [Jinja2 Template Designer Documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/)\n - [Image-Text-to-Text](https://huggingface.co/tasks/image-text-to-text)\n - [OpenAI Vision](https://platform.openai.com/docs/guides/vision)\n\nExamples:\n Answer questions from an image:\n\n ```python\n from distilabel.steps.tasks import TextGenerationWithImage\n from distilabel.models.llms import InferenceEndpointsLLM\n\n vision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"url\"\n )\n\n vision.load()\n\n result = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n }\n ]\n )\n )\n # result\n # [\n # {\n # \"instruction\": \"What\u2019s in this image?\",\n # \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n # \"generation\": \"Based on the visual cues in the image...\",\n # \"model_name\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\"\n # ... # distilabel_metadata would be here\n # }\n # ]\n # result[0][\"generation\"]\n # \"Based on the visual cues in the image, here are some possible story points:\n - The image features a wooden boardwalk leading through a lush grass field, possibly in a park or nature reserve.
Analysis and Ideas: * The abundance of green grass and trees suggests a healthy ecosystem or habitat. * The presence of wildlife, such as birds or deer, is possible based on the surroundings. * A footbridge or a pathway might be a common feature in this area, providing access to nearby attractions or points of interest. Additional Questions to Ask: * Why is a footbridge present in this area? * What kind of wildlife inhabits this region\" Answer questions from an image stored as base64:\n\n```python\n# For this example we will assume that we have the string representation of the image\n# stored, but will just take the image and transform it to base64 to ilustrate the example.\nimport requests\nimport base64\n\nimage_url =\"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\nimg = requests.get(image_url).content\nbase64_image = base64.b64encode(img).decode(\"utf-8\")\n\nfrom distilabel.steps.tasks import TextGenerationWithImage\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nvision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"base64\"\n)\n\nvision.load()\n\nresult = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": base64_image\n }\n ]\n )\n)\n Source code in src/distilabel/steps/tasks/text_generation_with_image.py class TextGenerationWithImage(TextGeneration):\n \"\"\"Text generation with images with an `LLM` given a prompt.\n\n `TextGenerationWithImage` is a pre-defined task that allows passing a custom prompt using the\n Jinja2 syntax. By default, a `instruction` is expected in the inputs, but the using\n `template` and `columns` attributes one can define a custom prompt and columns expected\n from the text. Additionally, an `image` column is expected containing one of the\n url, base64 encoded image or PIL image. This task inherits from `TextGeneration`,\n so all the functionality available in that task related to the prompt will be available\n here too.\n\n Attributes:\n system_prompt: The system prompt to use in the generation.\n If not, then no system prompt will be used. Defaults to `None`.\n template: The template to use for the generation. It must follow the Jinja2 template\n syntax. If not provided, it will assume the text passed is an instruction and\n construct the appropriate template.\n columns: A string with the column, or a list with columns expected in the template.\n Take a look at the examples for more information. 
Defaults to `instruction`.\n image_type: The type of the image provided, this will be used to preprocess if necessary.\n Must be one of \"url\", \"base64\" or \"PIL\".\n\n Input columns:\n - dynamic (determined by `columns` attribute): By default will be set to `instruction`.\n The columns can point both to a `str` or a `list[str]` to be used in the template.\n - image: The column containing the image URL, base64 encoded image or PIL image.\n\n Output columns:\n - generation (`str`): The generated text.\n - model_name (`str`): The name of the model used to generate the text.\n\n Categories:\n - text-generation\n\n References:\n - [Jinja2 Template Designer Documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/)\n - [Image-Text-to-Text](https://huggingface.co/tasks/image-text-to-text)\n - [OpenAI Vision](https://platform.openai.com/docs/guides/vision)\n\n Examples:\n Answer questions from an image:\n\n ```python\n from distilabel.steps.tasks import TextGenerationWithImage\n from distilabel.models.llms import InferenceEndpointsLLM\n\n vision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"url\"\n )\n\n vision.load()\n\n result = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n }\n ]\n )\n )\n # result\n # [\n # {\n # \"instruction\": \"What\\u2019s in this image?\",\n # \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n # \"generation\": \"Based on the visual cues in the image...\",\n # \"model_name\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\"\n # ... 
# distilabel_metadata would be here\n # }\n # ]\n # result[0][\"generation\"]\n # \"Based on the visual cues in the image, here are some possible story points:\\n\\n* The image features a wooden boardwalk leading through a lush grass field, possibly in a park or nature reserve.\\n\\nAnalysis and Ideas:\\n* The abundance of green grass and trees suggests a healthy ecosystem or habitat.\\n* The presence of wildlife, such as birds or deer, is possible based on the surroundings.\\n* A footbridge or a pathway might be a common feature in this area, providing access to nearby attractions or points of interest.\\n\\nAdditional Questions to Ask:\\n* Why is a footbridge present in this area?\\n* What kind of wildlife inhabits this region\"\n ```\n\n Answer questions from an image stored as base64:\n\n ```python\n # For this example we will assume that we have the string representation of the image\n # stored, but will just take the image and transform it to base64 to ilustrate the example.\n import requests\n import base64\n\n image_url =\"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\n img = requests.get(image_url).content\n base64_image = base64.b64encode(img).decode(\"utf-8\")\n\n from distilabel.steps.tasks import TextGenerationWithImage\n from distilabel.models.llms import InferenceEndpointsLLM\n\n vision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"base64\"\n )\n\n vision.load()\n\n result = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": base64_image\n }\n ]\n )\n )\n ```\n \"\"\"\n\n image_type: Literal[\"url\", \"base64\", \"PIL\"] = Field(\n default=\"url\",\n description=\"The type of the image provided, this will be used to preprocess if necessary.\",\n )\n\n @property\n def inputs(self) -> \"StepColumns\":\n columns = super().inputs\n columns[\"image\"] = True\n return columns\n\n def load(self) -> None:\n Task.load(self)\n\n for column in self.columns:\n check_column_in_template(\n column, self.template, page=\"components-gallery/tasks/visiongeneration/\"\n )\n\n self._template = Template(self.template)\n\n def _transform_image(self, image: Union[str, \"Image\"]) -> str:\n \"\"\"Transforms the image based on the `image_type` attribute.\"\"\"\n if self.image_type == \"url\":\n return image\n\n if self.image_type == \"base64\":\n return f\"data:image/jpeg;base64,{image}\"\n\n # Othwerwise, it's a PIL image\n return f\"data:image/jpeg;base64,{image_to_str(image)}\"\n\n def _prepare_message_content(self, input: dict[str, Any]) -> \"ChatType\":\n \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n fields = {column: input[column] for column in self.columns}\n img_url = self._transform_image(input[\"image\"])\n return [\n {\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"text\",\n \"text\": self._template.render(**fields),\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": img_url,\n },\n },\n ],\n }\n ]\n\n def format_input(self, input: dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n messages = self._prepare_message_content(input)\n\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n return messages # type: ignore\n 
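As a standalone illustration of the image handling above (a sketch, not the library's own helper; the tiny in-memory image is invented for the example): for "base64" and "PIL" inputs the image ends up embedded as a data:image/jpeg;base64,... URL inside the image_url part of the user message.

```python
import base64
import io

from PIL import Image

# Hypothetical in-memory image standing in for the `image` column.
image = Image.new("RGB", (8, 8), color="red")

# Turn the PIL image into the data URI shape used in the chat message.
buffer = io.BytesIO()
image.save(buffer, format="JPEG")
data_uri = f"data:image/jpeg;base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": data_uri}},
        ],
    }
]
```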
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGenerationWithImage._transform_image","title":"_transform_image(image) ","text":"Transforms the image based on the image_type attribute. Source code in src/distilabel/steps/tasks/text_generation_with_image.py def _transform_image(self, image: Union[str, \"Image\"]) -> str:\n \"\"\"Transforms the image based on the `image_type` attribute.\"\"\"\n if self.image_type == \"url\":\n return image\n\n if self.image_type == \"base64\":\n return f\"data:image/jpeg;base64,{image}\"\n\n # Othwerwise, it's a PIL image\n return f\"data:image/jpeg;base64,{image_to_str(image)}\"\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGenerationWithImage._prepare_message_content","title":"_prepare_message_content(input) ","text":"Prepares the content for the template and returns the formatted messages. Source code in src/distilabel/steps/tasks/text_generation_with_image.py def _prepare_message_content(self, input: dict[str, Any]) -> \"ChatType\":\n \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n fields = {column: input[column] for column in self.columns}\n img_url = self._transform_image(input[\"image\"])\n return [\n {\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"text\",\n \"text\": self._template.render(**fields),\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": img_url,\n },\n },\n ],\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGenerationWithImage.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/text_generation_with_image.py def format_input(self, input: dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n messages = self._prepare_message_content(input)\n\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n return messages # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback","title":"UltraFeedback ","text":" Bases: Task Rank generations focusing on different aspects using an LLM . UltraFeedback: Boosting Language Models with High-quality Feedback. Attributes: Name Type Description aspect Literal['helpfulness', 'honesty', 'instruction-following', 'truthfulness', 'overall-rating'] The aspect to perform with the UltraFeedback model. The available aspects are: - helpfulness : Evaluate text outputs based on helpfulness. - honesty : Evaluate text outputs based on honesty. - instruction-following : Evaluate text outputs based on given instructions. - truthfulness : Evaluate text outputs based on truthfulness. Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall assessment of the text outputs within a single prompt. The custom aspect is: - overall-rating : Evaluate text outputs based on an overall assessment. Defaults to \"overall-rating\" . Input columns - instruction (
str ): The reference instruction to evaluate the text outputs. - generations (
List[str] ): The text outputs to evaluate for the given instruction. Output columns - ratings (
List[float] ): The ratings for each of the provided text outputs. - rationales (
List[str] ): The rationales for each of the provided text outputs. - model_name (
str ): The name of the model used to generate the ratings and rationales. Categories References UltraFeedback: Boosting Language Models with High-quality Feedback UltraFeedback - GitHub Repository Examples: Rate generations from different LLMs based on the selected aspect: from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=False\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [1, 2],\n# 'rationales': ['explanation for 4', 'explanation for and a car'],\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# }\n# ]\n Rate generations from different LLMs based on the honesty, using the default structured output: from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n aspect=\"honesty\"\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [5, 1],\n# 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',\n# \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\n 5,\\n 1\\n] \\n\\n,\"rationales\": [\\n \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\n \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"\\n] }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Rate generations from different LLMs based on the helpfulness, using the default structured output: from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512},\n ),\n aspect=\"helpfulness\"\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [1, 5],\n# 'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. 
However, it lacks comprehensive information or detailed description.',\n# 'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n# 'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n# 'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n# 'types': [1, 3, 1],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\n \"ratings\": [\\n 1,\\n 5\\n ]\\n ,\\n \"rationales\": [\\n \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\n \"Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.\"\\n ]\\n ,\\n \"rationales_for_rating\": [\\n \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\n \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\n ]\\n ,\\n \"types\": [\\n 1, 3,\\n 1\\n ]\\n }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Citations @misc{cui2024ultrafeedbackboostinglanguagemodels,\n title={UltraFeedback: Boosting Language Models with Scaled AI Feedback},\n author={Ganqu Cui and Lifan Yuan and Ning Ding and Guanming Yao and Bingxiang He and Wei Zhu and Yuan Ni and Guotong Xie and Ruobing Xie and Yankai Lin and Zhiyuan Liu and Maosong Sun},\n year={2024},\n eprint={2310.01377},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2310.01377},\n}\n Source code in src/distilabel/steps/tasks/ultrafeedback.py class UltraFeedback(Task):\n \"\"\"Rank generations focusing on different aspects using an `LLM`.\n\n UltraFeedback: Boosting Language Models with High-quality Feedback.\n\n Attributes:\n aspect: The aspect to perform with the `UltraFeedback` model. The available aspects are:\n - `helpfulness`: Evaluate text outputs based on helpfulness.\n - `honesty`: Evaluate text outputs based on honesty.\n - `instruction-following`: Evaluate text outputs based on given instructions.\n - `truthfulness`: Evaluate text outputs based on truthfulness.\n Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall\n assessment of the text outputs within a single prompt. 
The custom aspect is:\n - `overall-rating`: Evaluate text outputs based on an overall assessment.\n Defaults to `\"overall-rating\"`.\n\n Input columns:\n - instruction (`str`): The reference instruction to evaluate the text outputs.\n - generations (`List[str]`): The text outputs to evaluate for the given instruction.\n\n Output columns:\n - ratings (`List[float]`): The ratings for each of the provided text outputs.\n - rationales (`List[str]`): The rationales for each of the provided text outputs.\n - model_name (`str`): The name of the model used to generate the ratings and rationales.\n\n Categories:\n - preference\n\n References:\n - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377)\n - [`UltraFeedback - GitHub Repository`](https://github.com/OpenBMB/UltraFeedback)\n\n Examples:\n Rate generations from different LLMs based on the selected aspect:\n\n ```python\n from distilabel.steps.tasks import UltraFeedback\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n ultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=False\n )\n\n ultrafeedback.load()\n\n result = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'How much is 2+2?',\n # 'generations': ['4', 'and a car'],\n # 'ratings': [1, 2],\n # 'rationales': ['explanation for 4', 'explanation for and a car'],\n # 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n # }\n # ]\n ```\n\n Rate generations from different LLMs based on the honesty, using the default structured output:\n\n ```python\n from distilabel.steps.tasks import UltraFeedback\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n ultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n aspect=\"honesty\"\n )\n\n ultrafeedback.load()\n\n result = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n )\n # result\n # [{'instruction': 'How much is 2+2?',\n # 'generations': ['4', 'and a car'],\n # 'ratings': [5, 1],\n # 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',\n # \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n # 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\\\n 5,\\\\n 1\\\\n] \\\\n\\\\n,\"rationales\": [\\\\n \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\\\n \"The response is confidently incorrect, as it provides unrelated information (\\'a car\\') and does not address the question. 
The model shows no uncertainty or indication that it does not know the answer.\"\\\\n] }'},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Rate generations from different LLMs based on the helpfulness, using the default structured output:\n\n ```python\n from distilabel.steps.tasks import UltraFeedback\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n ultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512},\n ),\n aspect=\"helpfulness\"\n )\n\n ultrafeedback.load()\n\n result = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n )\n # result\n # [{'instruction': 'How much is 2+2?',\n # 'generations': ['4', 'and a car'],\n # 'ratings': [1, 5],\n # 'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',\n # 'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n # 'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n # 'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n # 'types': [1, 3, 1],\n # 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\\\n \"ratings\": [\\\\n 1,\\\\n 5\\\\n ]\\\\n ,\\\\n \"rationales\": [\\\\n \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\\\n \"Text 2 is neither clear nor relevant to the task. 
It does not provide any useful information and seems unrelated to the question.\"\\\\n ]\\\\n ,\\\\n \"rationales_for_rating\": [\\\\n \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\\\n \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\\\n ]\\\\n ,\\\\n \"types\": [\\\\n 1, 3,\\\\n 1\\\\n ]\\\\n }'},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Citations:\n ```\n @misc{cui2024ultrafeedbackboostinglanguagemodels,\n title={UltraFeedback: Boosting Language Models with Scaled AI Feedback},\n author={Ganqu Cui and Lifan Yuan and Ning Ding and Guanming Yao and Bingxiang He and Wei Zhu and Yuan Ni and Guotong Xie and Ruobing Xie and Yankai Lin and Zhiyuan Liu and Maosong Sun},\n year={2024},\n eprint={2310.01377},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2310.01377},\n }\n ```\n \"\"\"\n\n aspect: Literal[\n \"helpfulness\",\n \"honesty\",\n \"instruction-following\",\n \"truthfulness\",\n # Custom aspects\n \"overall-rating\",\n ] = \"overall-rating\"\n\n _system_prompt: str = PrivateAttr(\n default=(\n \"Your role is to evaluate text quality based on given criteria.\\n\"\n 'You\\'ll receive an instructional description (\"Instruction\") and {no_texts} text outputs (\"Text\").\\n'\n \"Understand and interpret instructions to evaluate effectively.\\n\"\n \"Provide annotations for each text with a rating and rationale.\\n\"\n \"The {no_texts} texts given are independent, and should be evaluated separately.\\n\"\n )\n )\n _template: Optional[\"Template\"] = PrivateAttr(default=...)\n _can_be_used_with_offline_batch_generation = True\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"ultrafeedback\"\n / f\"{self.aspect}.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`, and the `generations` for it.\"\"\"\n return [\"instruction\", \"generations\"]\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"system\",\n \"content\": self._system_prompt.format(\n no_texts=len(input[\"generations\"])\n ),\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], generations=input[\"generations\"]\n ),\n },\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n columns = []\n if self.aspect in [\"honesty\", \"instruction-following\", \"overall-rating\"]:\n columns = [\"ratings\", \"rationales\"]\n elif self.aspect in [\"helpfulness\", \"truthfulness\"]:\n columns = [\"types\", \"rationales\", \"ratings\", \"rationales-for-ratings\"]\n return columns + [\"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `ratings` and `rationales` for\n each of the provided `generations` for the given `instruction`. 
The `model_name`\n will be automatically included within the `process` method of `Task`.\n\n Args:\n output: a string representing the output of the LLM via the `process` method.\n input: the input to the task, as required by some tasks to format the output.\n\n Returns:\n A dictionary containing either the `ratings` and `rationales` for each of the provided\n `generations` for the given `instruction` if the provided aspect is either `honesty`,\n `instruction-following`, or `overall-rating`; or the `types`, `rationales`,\n `ratings`, and `rationales-for-ratings` for each of the provided `generations` for the\n given `instruction` if the provided aspect is either `helpfulness` or `truthfulness`.\n \"\"\"\n assert input is not None, \"Input is required to format the output.\"\n\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return self._format_ratings_rationales_output(output, input)\n\n return self._format_types_ratings_rationales_output(output, input)\n\n def _format_ratings_rationales_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, List[Any]]:\n \"\"\"Formats the output when the aspect is either `honesty`, `instruction-following`, or `overall-rating`.\"\"\"\n if output is None:\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n }\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n pattern = r\"Rating: (.+?)\\nRationale: (.+)\"\n sections = output.split(\"\\n\\n\")\n\n formatted_outputs = []\n for section in sections:\n matches = None\n if section is not None and section != \"\":\n matches = re.search(pattern, section, re.DOTALL)\n if not matches:\n formatted_outputs.append({\"ratings\": None, \"rationales\": None})\n continue\n\n formatted_outputs.append(\n {\n \"ratings\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n if matches.group(1) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales\": matches.group(2),\n }\n )\n return group_dicts(*formatted_outputs)\n\n def _format_types_ratings_rationales_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, List[Any]]:\n \"\"\"Formats the output when the aspect is either `helpfulness` or `truthfulness`.\"\"\"\n if output is None:\n return {\n \"types\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n }\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n pattern = r\"Type: (.+?)\\nRationale: (.+?)\\nRating: (.+?)\\nRationale: (.+)\"\n\n sections = output.split(\"\\n\\n\")\n\n formatted_outputs = []\n for section in sections:\n matches = None\n if section is not None and section != \"\":\n matches = re.search(pattern, section, re.DOTALL)\n if not matches:\n formatted_outputs.append(\n {\n \"types\": None,\n \"rationales\": None,\n \"ratings\": None,\n \"rationales-for-ratings\": None,\n }\n )\n continue\n\n formatted_outputs.append(\n {\n \"types\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n if matches.group(1) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales\": matches.group(2),\n \"ratings\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(3))[0])\n if matches.group(3) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales-for-ratings\": 
matches.group(4),\n }\n )\n return group_dicts(*formatted_outputs)\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaUltraFeedback(BaseModel):\n ratings: List[int]\n rationales: List[str]\n\n class SchemaUltraFeedbackWithType(BaseModel):\n types: List[Optional[int]]\n ratings: List[int]\n rationales: List[str]\n rationales_for_rating: List[str]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return {\n \"properties\": {\n \"ratings\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Ratings\",\n \"type\": \"array\",\n },\n \"rationales\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"ratings\", \"rationales\"],\n \"title\": \"SchemaUltraFeedback\",\n \"type\": \"object\",\n }\n return {\n \"properties\": {\n \"types\": {\n \"items\": {\"anyOf\": [{\"type\": \"integer\"}, {\"type\": \"null\"}]},\n \"title\": \"Types\",\n \"type\": \"array\",\n },\n \"ratings\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Ratings\",\n \"type\": \"array\",\n },\n \"rationales\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales\",\n \"type\": \"array\",\n },\n \"rationales_for_rating\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales For Rating\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"types\", \"ratings\", \"rationales\", \"rationales_for_rating\"],\n \"title\": \"SchemaUltraFeedbackWithType\",\n \"type\": \"object\",\n }\n\n def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n }\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n \"types\": [None] * len(input[\"generations\"]),\n \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n }\n\n @override\n def _sample_input(self) -> ChatType:\n return self.format_input(\n {\n \"instruction\": f\"<PLACEHOLDER_{'instruction'.upper()}>\",\n \"generations\": [\n f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n ],\n }\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.inputs","title":"inputs: List[str] property ","text":"The input for the task is the instruction , and the generations for it. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.outputs","title":"outputs: List[str] property ","text":"The output for the task is the generation and the model_name . 
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.load","title":"load() ","text":"Loads the Jinja2 template for the given aspect . Source code in src/distilabel/steps/tasks/ultrafeedback.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"ultrafeedback\"\n / f\"{self.aspect}.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/ultrafeedback.py def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"system\",\n \"content\": self._system_prompt.format(\n no_texts=len(input[\"generations\"])\n ),\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], generations=input[\"generations\"]\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.format_output","title":"format_output(output, input=None) ","text":"The output is formatted as a dictionary with the ratings and rationales for each of the provided generations for the given instruction . The model_name will be automatically included within the process method of Task . Parameters: Name Type Description Default output Union[str, None] a string representing the output of the LLM via the process method. required input Union[Dict[str, Any], None] the input to the task, as required by some tasks to format the output. None Returns: Type Description Dict[str, Any] A dictionary containing either the ratings and rationales for each of the provided Dict[str, Any] generations for the given instruction if the provided aspect is either honesty , Dict[str, Any] instruction-following , or overall-rating ; or the types , rationales , Dict[str, Any] ratings , and rationales-for-ratings for each of the provided generations for the Dict[str, Any] given instruction if the provided aspect is either helpfulness or truthfulness . Source code in src/distilabel/steps/tasks/ultrafeedback.py def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `ratings` and `rationales` for\n each of the provided `generations` for the given `instruction`. 
The `model_name`\n will be automatically included within the `process` method of `Task`.\n\n Args:\n output: a string representing the output of the LLM via the `process` method.\n input: the input to the task, as required by some tasks to format the output.\n\n Returns:\n A dictionary containing either the `ratings` and `rationales` for each of the provided\n `generations` for the given `instruction` if the provided aspect is either `honesty`,\n `instruction-following`, or `overall-rating`; or the `types`, `rationales`,\n `ratings`, and `rationales-for-ratings` for each of the provided `generations` for the\n given `instruction` if the provided aspect is either `helpfulness` or `truthfulness`.\n \"\"\"\n assert input is not None, \"Input is required to format the output.\"\n\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return self._format_ratings_rationales_output(output, input)\n\n return self._format_types_ratings_rationales_output(output, input)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_ratings_rationales_output","title":"_format_ratings_rationales_output(output, input) ","text":"Formats the output when the aspect is either honesty , instruction-following , or overall-rating . Source code in src/distilabel/steps/tasks/ultrafeedback.py def _format_ratings_rationales_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, List[Any]]:\n \"\"\"Formats the output when the aspect is either `honesty`, `instruction-following`, or `overall-rating`.\"\"\"\n if output is None:\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n }\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n pattern = r\"Rating: (.+?)\\nRationale: (.+)\"\n sections = output.split(\"\\n\\n\")\n\n formatted_outputs = []\n for section in sections:\n matches = None\n if section is not None and section != \"\":\n matches = re.search(pattern, section, re.DOTALL)\n if not matches:\n formatted_outputs.append({\"ratings\": None, \"rationales\": None})\n continue\n\n formatted_outputs.append(\n {\n \"ratings\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n if matches.group(1) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales\": matches.group(2),\n }\n )\n return group_dicts(*formatted_outputs)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_types_ratings_rationales_output","title":"_format_types_ratings_rationales_output(output, input) ","text":"Formats the output when the aspect is either helpfulness or truthfulness . 
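For illustration, when the default structured output is not used, the raw output this parser expects consists of one block per generation separated by a blank line, following the Type / Rationale / Rating / Rationale layout matched by the regex in the source below (the concrete values here are made up): raw_output = (\n \"Type: 1\\nRationale: The answer addresses the question.\\nRating: 5\\nRationale: The answer is accurate and complete.\"\n \"\\n\\n\"\n \"Type: 3\\nRationale: The answer is unrelated to the question.\\nRating: 1\\nRationale: It provides no useful information.\"\n)\n 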
Source code in src/distilabel/steps/tasks/ultrafeedback.py def _format_types_ratings_rationales_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, List[Any]]:\n \"\"\"Formats the output when the aspect is either `helpfulness` or `truthfulness`.\"\"\"\n if output is None:\n return {\n \"types\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n }\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n pattern = r\"Type: (.+?)\\nRationale: (.+?)\\nRating: (.+?)\\nRationale: (.+)\"\n\n sections = output.split(\"\\n\\n\")\n\n formatted_outputs = []\n for section in sections:\n matches = None\n if section is not None and section != \"\":\n matches = re.search(pattern, section, re.DOTALL)\n if not matches:\n formatted_outputs.append(\n {\n \"types\": None,\n \"rationales\": None,\n \"ratings\": None,\n \"rationales-for-ratings\": None,\n }\n )\n continue\n\n formatted_outputs.append(\n {\n \"types\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n if matches.group(1) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales\": matches.group(2),\n \"ratings\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(3))[0])\n if matches.group(3) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales-for-ratings\": matches.group(4),\n }\n )\n return group_dicts(*formatted_outputs)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaUltraFeedback(BaseModel):\n ratings: List[int]\n rationales: List[str]\n\nclass SchemaUltraFeedbackWithType(BaseModel):\n types: List[Optional[int]]\n ratings: List[int]\n rationales: List[str]\n rationales_for_rating: List[str]\n Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. 
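As a rough sketch, a model response that satisfies the first schema (used for the honesty , instruction-following and overall-rating aspects) could look like the following JSON, where the ratings and rationales are purely illustrative: {\n \"ratings\": [5, 1],\n \"rationales\": [\n \"The response is correct and directly answers the question.\",\n \"The response provides unrelated information and does not address the question.\"\n ]\n}\n 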
Source code in src/distilabel/steps/tasks/ultrafeedback.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaUltraFeedback(BaseModel):\n ratings: List[int]\n rationales: List[str]\n\n class SchemaUltraFeedbackWithType(BaseModel):\n types: List[Optional[int]]\n ratings: List[int]\n rationales: List[str]\n rationales_for_rating: List[str]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return {\n \"properties\": {\n \"ratings\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Ratings\",\n \"type\": \"array\",\n },\n \"rationales\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"ratings\", \"rationales\"],\n \"title\": \"SchemaUltraFeedback\",\n \"type\": \"object\",\n }\n return {\n \"properties\": {\n \"types\": {\n \"items\": {\"anyOf\": [{\"type\": \"integer\"}, {\"type\": \"null\"}]},\n \"title\": \"Types\",\n \"type\": \"array\",\n },\n \"ratings\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Ratings\",\n \"type\": \"array\",\n },\n \"rationales\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales\",\n \"type\": \"array\",\n },\n \"rationales_for_rating\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales For Rating\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"types\", \"ratings\", \"rationales\", \"rationales_for_rating\"],\n \"title\": \"SchemaUltraFeedbackWithType\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_structured_output","title":"_format_structured_output(output, input) ","text":"Parses the structured response, which should correspond to a dictionary with either positive , or positive and negative keys. Parameters: Name Type Description Default output str The output from the LLM . required Returns: Type Description Dict[str, Any] Formatted output. Source code in src/distilabel/steps/tasks/ultrafeedback.py def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n }\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n \"types\": [None] * len(input[\"generations\"]),\n \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.URIAL","title":"URIAL ","text":" Bases: Task Generates a response using a non-instruct fine-tuned model. URIAL is a pre-defined task that generates a response using a non-instruct fine-tuned model. 
This task is used to generate a response based on the conversation provided as input. Input columns - instruction (
str , optional): The instruction to generate a response from. - conversation (
List[Dict[str, str]] , optional): The conversation to generate a response from (the last message must be from the user). Output columns - generation (
str ): The generated response. - model_name (
str ): The name of the model used to generate the response. Categories References - The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning
Examples: Generate text from an instruction: from distilabel.models import vLLM\nfrom distilabel.steps.tasks import URIAL\n\nstep = URIAL(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3.1-8B\",\n generation_kwargs={\"temperature\": 0.7},\n ),\n)\n\nstep.load()\n\nresults = next(\n step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n)\n# [\n# {\n# 'instruction': \"What's the most most common type of cloud?\",\n# 'generation': 'Clouds are classified into three main types, high, middle, and low. The most common type of cloud is the middle cloud.',\n# 'distilabel_metadata': {...},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n# }\n# ]\n Source code in src/distilabel/steps/tasks/urial.py class URIAL(Task):\n \"\"\"Generates a response using a non-instruct fine-tuned model.\n\n `URIAL` is a pre-defined task that generates a response using a non-instruct fine-tuned\n model. This task is used to generate a response based on the conversation provided as\n input.\n\n Input columns:\n - instruction (`str`, optional): The instruction to generate a response from.\n - conversation (`List[Dict[str, str]]`, optional): The conversation to generate\n a response from (the last message must be from the user).\n\n Output columns:\n - generation (`str`): The generated response.\n - model_name (`str`): The name of the model used to generate the response.\n\n Categories:\n - text-generation\n\n References:\n - [The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning](https://arxiv.org/abs/2312.01552)\n\n Examples:\n Generate text from an instruction:\n\n ```python\n from distilabel.models import vLLM\n from distilabel.steps.tasks import URIAL\n\n step = URIAL(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3.1-8B\",\n generation_kwargs={\"temperature\": 0.7},\n ),\n )\n\n step.load()\n\n results = next(\n step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n )\n # [\n # {\n # 'instruction': \"What's the most most common type of cloud?\",\n # 'generation': 'Clouds are classified into three main types, high, middle, and low. 
The most common type of cloud is the middle cloud.',\n # 'distilabel_metadata': {...},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n # }\n # ]\n ```\n \"\"\"\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"urial.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> \"StepColumns\":\n return {\"instruction\": False, \"conversation\": False}\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n messages = (\n [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n if \"instruction\" in input\n else input[\"conversation\"]\n )\n\n if messages[-1][\"role\"] != \"user\":\n raise ValueError(\"The last message must be from the user.\")\n\n return [{\"role\": \"user\", \"content\": self._template.render(messages=messages)}]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"generation\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n if output is None:\n return {\"generation\": None}\n\n response = output.split(\"\\n\\n# User\")[0]\n if response.startswith(\"\\n\\n\"):\n response = response[2:]\n response = response.strip()\n\n return {\"generation\": response}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.URIAL.load","title":"load() ","text":"Loads the Jinja2 template for the given aspect . Source code in src/distilabel/steps/tasks/urial.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"urial.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.task","title":"task(inputs=None, outputs=None) ","text":"Creates a Task from a formatting output function. Parameters: Name Type Description Default inputs Union[StepColumns, None] a list containing the name of the inputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not, that are required by the step. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None . None outputs Union[StepColumns, None] a list containing the name of the outputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column will be generated or not. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None . None Source code in src/distilabel/steps/tasks/decorator.py def task(\n inputs: Union[\"StepColumns\", None] = None,\n outputs: Union[\"StepColumns\", None] = None,\n) -> Callable[..., Type[\"Task\"]]:\n \"\"\"Creates a `Task` from a formatting output function.\n\n Args:\n inputs: a list containing the name of the inputs columns/keys or a dictionary\n where the keys are the columns and the values are booleans indicating whether\n the column is required or not, that are required by the step. If not provided\n the default will be an empty list `[]` and it will be assumed that the step\n doesn't need any specific columns. 
Defaults to `None`.\n outputs: a list containing the name of the outputs columns/keys or a dictionary\n where the keys are the columns and the values are booleans indicating whether\n the column will be generated or not. If not provided the default will be an\n empty list `[]` and it will be assumed that the step doesn't need any specific\n columns. Defaults to `None`.\n \"\"\"\n\n inputs = inputs or []\n outputs = outputs or []\n\n def decorator(func: TaskFormattingOutputFunc) -> Type[\"Task\"]:\n doc = inspect.getdoc(func)\n if doc is None:\n raise DistilabelUserError(\n \"When using the `task` decorator, including a docstring in the formatting\"\n \" function is mandatory. The docstring must follow the format described\"\n \" in the documentation.\",\n page=\"\",\n )\n\n system_prompt, user_message_template = _parse_docstring(doc)\n _validate_templates(inputs, system_prompt, user_message_template)\n\n def inputs_property(self) -> \"StepColumns\":\n return inputs\n\n def outputs_property(self) -> \"StepColumns\":\n return outputs\n\n def format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n return [\n {\"role\": \"system\", \"content\": system_prompt.format(**input)},\n {\"role\": \"user\", \"content\": user_message_template.format(**input)},\n ]\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n return func(output, input)\n\n return type(\n func.__name__,\n (Task,),\n {\n \"inputs\": property(inputs_property),\n \"outputs\": property(outputs_property),\n \"__module__\": func.__module__,\n \"format_input\": format_input,\n \"format_output\": format_output,\n },\n )\n\n return decorator\n "},{"location":"api/task/typing/","title":"Task Typing","text":""},{"location":"api/task/typing/#distilabel.steps.tasks.typing","title":"typing ","text":""},{"location":"api/task/typing/#distilabel.steps.tasks.typing.ChatType","title":"ChatType = List[ChatItem] module-attribute ","text":"ChatType is a type alias for a list of dict s following the OpenAI conversational format. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StructuredOutputType","title":"StructuredOutputType = Union[OutlinesStructuredOutputType, InstructorStructuredOutputType] module-attribute ","text":"StructuredOutputType is an alias for the union of OutlinesStructuredOutputType and InstructorStructuredOutputType . "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StandardInput","title":"StandardInput = ChatType module-attribute ","text":"StandardInput is an alias for ChatType that defines the default / standard input produced by format_input . "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StructuredInput","title":"StructuredInput = Tuple[StandardInput, Union[StructuredOutputType, None]] module-attribute ","text":"StructuredInput defines a type produced by format_input when using either StructuredGeneration or a subclass of it. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.FormattedInput","title":"FormattedInput = Union[StandardInput, StructuredInput, ChatType] module-attribute ","text":"FormattedInput is an alias for the union of StandardInput and StructuredInput as generated by format_input and expected by the LLM s, as well as ConversationType for the vision language models. 
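For illustration, a StandardInput is just a list of OpenAI-style chat messages, while a StructuredInput pairs those messages with an optional structured output configuration (the values below are only an example, and the configuration could also be None ): standard_input = [\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n]\n\n# A regex-based structured output configuration; a JSON schema or pydantic model could be used instead.\nstructured_input = (standard_input, {\"format\": \"regex\", \"schema\": r\"\\d+\"})\n 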
"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.ImageUrl","title":"ImageUrl ","text":" Bases: TypedDict Source code in src/distilabel/steps/tasks/typing.py class ImageUrl(TypedDict):\n url: Required[str]\n \"\"\"Either a URL of the image or the base64 encoded image data.\"\"\"\n "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.ImageUrl.url","title":"url: Required[str] instance-attribute ","text":"Either a URL of the image or the base64 encoded image data. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.ImageContent","title":"ImageContent ","text":" Bases: TypedDict Type alias for the user's message in a conversation that can include text or an image. It's the standard type for vision language models: https://platform.openai.com/docs/guides/vision Source code in src/distilabel/steps/tasks/typing.py class ImageContent(TypedDict, total=False):\n \"\"\"Type alias for the user's message in a conversation that can include text or an image.\n It's the standard type for vision language models:\n https://platform.openai.com/docs/guides/vision\n \"\"\"\n\n type: Required[Literal[\"image_url\"]]\n image_url: Required[ImageUrl]\n "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType","title":"OutlinesStructuredOutputType ","text":" Bases: TypedDict TypedDict to represent the structured output configuration from outlines . Source code in src/distilabel/steps/tasks/typing.py class OutlinesStructuredOutputType(TypedDict, total=False):\n \"\"\"TypedDict to represent the structured output configuration from `outlines`.\"\"\"\n\n format: Literal[\"json\", \"regex\"]\n \"\"\"One of \"json\" or \"regex\".\"\"\"\n schema: Union[str, Type[BaseModel], Dict[str, Any]]\n \"\"\"The schema to use for the structured output. If \"json\", it\n can be a pydantic.BaseModel class, or the schema as a string,\n as obtained from `model_to_schema(BaseModel)`, if \"regex\", it\n should be a regex pattern as a string.\n \"\"\"\n whitespace_pattern: Optional[Union[str, List[str]]]\n \"\"\"If \"json\" corresponds to a string or a list of\n strings with a pattern (doesn't impact string literals).\n For example, to allow only a single space or newline with\n `whitespace_pattern=r\"[\\n ]?\"`\n \"\"\"\n "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.format","title":"format: Literal['json', 'regex'] instance-attribute ","text":"One of \"json\" or \"regex\". "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.schema","title":"schema: Union[str, Type[BaseModel], Dict[str, Any]] instance-attribute ","text":"The schema to use for the structured output. If \"json\", it can be a pydantic.BaseModel class, or the schema as a string, as obtained from model_to_schema(BaseModel) , if \"regex\", it should be a regex pattern as a string. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.whitespace_pattern","title":"whitespace_pattern: Optional[Union[str, List[str]]] instance-attribute ","text":"If \"json\" corresponds to a string or a list of strings with a pattern (doesn't impact string literals). For example, to allow only a single space or newline with whitespace_pattern=r\"[ ]?\" "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType","title":"InstructorStructuredOutputType ","text":" Bases: TypedDict TypedDict to represent the structured output configuration from instructor . 
Source code in src/distilabel/steps/tasks/typing.py class InstructorStructuredOutputType(TypedDict, total=False):\n \"\"\"TypedDict to represent the structured output configuration from `instructor`.\"\"\"\n\n format: Optional[Literal[\"json\"]]\n \"\"\"One of \"json\".\"\"\"\n schema: Union[Type[BaseModel], Dict[str, Any]]\n \"\"\"The schema to use for the structured output, a `pydantic.BaseModel` class. \"\"\"\n mode: Optional[str]\n \"\"\"Generation mode. Take a look at `instructor.Mode` for more information, if not informed it will\n be determined automatically. \"\"\"\n max_retries: int\n \"\"\"Number of times to reask the model in case of error, if not set will default to the model's default. \"\"\"\n "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.format","title":"format: Optional[Literal['json']] instance-attribute ","text":"One of \"json\". "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.schema","title":"schema: Union[Type[BaseModel], Dict[str, Any]] instance-attribute ","text":"The schema to use for the structured output, a pydantic.BaseModel class. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.mode","title":"mode: Optional[str] instance-attribute ","text":"Generation mode. Take a look at instructor.Mode for more information, if not informed it will be determined automatically. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.max_retries","title":"max_retries: int instance-attribute ","text":"Number of times to reask the model in case of error, if not set will default to the model's default. "},{"location":"sections/community/","title":"Community","text":"We are an open-source community-driven project not only focused on building a great product but also on building a great community, where you can get support, share your experiences, and contribute to the project! We would love to hear from you and help you get started with distilabel. -
Discord In our Discord channels (#argilla-general and #argilla-help), you can get direct support from the community. Discord \u2197 -
Community Meetup We host bi-weekly community meetups where you can listen in or present your work. Community Meetup \u2197 -
Changelog The changelog is where you can find the latest updates and changes to the distilabel project. Changelog \u2197 -
Roadmap We love to discuss our plans with the community. Feel encouraged to participate in our roadmap discussions. Roadmap \u2197 "},{"location":"sections/community/#badges","title":"Badges","text":"If you build something cool with distilabel consider adding one of these badges to your dataset or model card. [<img src=\"https://raw.githubusercontent.com/argilla-io/distilabel/main/docs/assets/distilabel-badge-light.png\" alt=\"Built with Distilabel\" width=\"200\" height=\"32\"/>](https://github.com/argilla-io/distilabel)\n [<img src=\"https://raw.githubusercontent.com/argilla-io/distilabel/main/docs/assets/distilabel-badge-dark.png\" alt=\"Built with Distilabel\" width=\"200\" height=\"32\"/>](https://github.com/argilla-io/distilabel)\n "},{"location":"sections/community/#contribute","title":"Contribute","text":"To directly contribute with distilabel , check our good first issues or open a new one. "},{"location":"sections/community/contributor/","title":"How to contribute?","text":"Thank you for investing your time in contributing to the project! Any contribution you make will be reflected in the most recent version of distilabel \ud83e\udd29. New to contributing in general? If you're a new contributor, read the README to get an overview of the project. In addition, here are some resources to help you get started with open-source contributions: - Discord: You are welcome to join the distilabel Discord community, where you can keep in touch with other users, contributors and the distilabel team. In the following section, you can find more information on how to get started in Discord.
- Git: A version control tool for keeping track of the changes in your files. Using its command-line interface (CLI), you can make your contributions easily; you will need it installed and up to date on your computer.
- GitHub: A cloud-based platform built on git that allows developers to collaborate on projects. To contribute to distilabel, you'll need to create an account. Check the Contributor Workflow with Git and GitHub for more info.
- Developer Documentation: To collaborate, you'll need to set up an efficient development environment. Check the Installation guide to learn how to do it.
"},{"location":"sections/community/contributor/#first-contact-in-discord","title":"First Contact in Discord","text":"Discord is a handy tool for more casual conversations and to answer day-to-day questions. As part of Hugging Face, we have set up some distilabel channels on the server. Click here to join the Hugging Face Discord community effortlessly. When part of the Hugging Face Discord, you can select \"Channels & roles\" and select \"Argilla\" along with any of the other groups that are interesting to you. \"Argilla\" will cover anything about argilla and distilabel. You can join the following channels: - #argilla-distilabel-announcements: \ud83d\udce3 Stay up-to-date.
- #argilla-distilabel-general: \ud83d\udcac For general discussions.
- #argilla-distilabel-help: \ud83d\ude4b\u200d\u2640\ufe0f Need assistance? We're always here to help. Select the appropriate label (argilla or distilabel) for your issue and post it.
So now there is only one thing left to do: introduce yourself and talk to the community. You'll always be welcome! \ud83e\udd17\ud83d\udc4b "},{"location":"sections/community/contributor/#contributor-workflow-with-git-and-github","title":"Contributor Workflow with Git and GitHub","text":"If you're working with distilabel and suddenly a new idea comes to your mind or you find an issue that can be improved, it's time to actively participate and contribute to the project! "},{"location":"sections/community/contributor/#report-an-issue","title":"Report an issue","text":"If you spot a problem, search if an issue already exists, you can use the Label filter. If that is the case, participate in the conversation. If it does not exist, create an issue by clicking on New Issue . This will show various templates; choose the one that best suits your issue. Once you choose one, you will need to fill it in following the guidelines. Try to be as clear as possible. In addition, you can assign yourself to the issue and add or choose the right labels. Finally, click on Submit new issue . "},{"location":"sections/community/contributor/#work-with-a-fork","title":"Work with a fork","text":""},{"location":"sections/community/contributor/#fork-the-distilabel-repository","title":"Fork the distilabel repository","text":"After having reported the issue, you can start working on it. For that, you will need to create a fork of the project. To do that, click on the Fork button. Now, fill in the information. Remember to uncheck the Copy develop branch only if you are going to work in or from another branch (for instance, to fix documentation, the main branch is used). Then, click on Create fork . You will be redirected to your fork. You can see that you are in your fork because the name of the repository will be your username/distilabel , and it will indicate forked from argilla-io/distilabel . "},{"location":"sections/community/contributor/#clone-your-forked-repository","title":"Clone your forked repository","text":"In order to make the required adjustments, clone the forked repository to your local machine. Choose the destination folder and run the following command: git clone https://github.com/[your-github-username]/distilabel.git\ncd distilabel\n To keep your fork\u2019s main/develop branch up to date with our repo, add it as an upstream remote branch. git remote add upstream https://github.com/argilla-io/distilabel.git\n "},{"location":"sections/community/contributor/#create-a-new-branch","title":"Create a new branch","text":"For each issue you're addressing, it's advisable to create a new branch. GitHub offers a straightforward method to streamline this process. \u26a0\ufe0f Never work directly on the main or develop branch. Always create a new branch for your changes. Navigate to your issue, and on the right column, select Create a branch . After the new window pops up, the branch will be named after the issue and include a prefix such as feature/, bug/, or docs/ to facilitate quick recognition of the issue type. In the Repository destination , pick your fork ( [your-github-username]/distilabel), and then select Change branch source to specify the source branch for creating the new one. Complete the process by clicking Create branch . \ud83e\udd14 Remember that the main branch is only used to work with the documentation. For any other changes, use the develop branch. Now, locally, change to the new branch you just created. 
git fetch origin\ngit checkout [branch-name]\n "},{"location":"sections/community/contributor/#make-changes-and-push-them","title":"Make changes and push them","text":"Make the changes you want in your local repository, and test that everything works and you are following the guidelines. Once you have finished, you can check the status of your repository and synchronize with the upstream repo with the following command: # Check the status of your repository\ngit status\n\n# Synchronize with the upstream repo\ngit checkout [branch-name]\ngit rebase [default-branch]\n If everything is right, we need to commit and push the changes to your fork. For that, run the following commands: # Add the changes to the staging area\ngit add filename\n\n# Commit the changes by writing a proper message\ngit commit -m \"commit-message\"\n\n# Push the changes to your fork\ngit push origin [branch-name]\n When pushing, you will be asked to enter your GitHub login credentials. Once the push is complete, all local commits will be on your GitHub repository. "},{"location":"sections/community/contributor/#create-a-pull-request","title":"Create a pull request","text":"Come back to GitHub, navigate to the original repository where you created your fork, and click on Compare & pull request . First, click on compare across forks and select the right repositories and branches. In the base repository, keep in mind that you should select either main or develop based on the modifications made. In the head repository, indicate your forked repository and the branch corresponding to the issue. Then, fill in the pull request template. You should add a prefix to the PR name, as we did with the branch above. If you are working on a new feature, you can name your PR as feat: TITLE . If your PR consists of a solution for a bug, you can name your PR as bug: TITLE . And, if your work is for improving the documentation, you can name your PR as docs: TITLE . In addition, on the right side, you can select a reviewer (for instance, if you discussed the issue with a member of the team) and assign the pull request to yourself. It is highly advisable to add labels to the PR as well. You can do this again via the labels section on the right of the screen. For instance, if you are addressing a bug, add the bug label, or if the PR is related to the documentation, add the documentation label. This way, PRs can be easily filtered. Finally, fill in the template carefully and follow the guidelines. Remember to link the original issue and enable the checkbox to allow maintainer edits so the branch can be updated for a merge. Then, click on Create pull request . For the PR body, ensure you give a description of what the PR contains, and add examples if possible (and if they apply to the contribution) to help with the review process. You can take a look at #PR 974 or #PR 983 for examples of typical PRs. "},{"location":"sections/community/contributor/#review-your-pull-request","title":"Review your pull request","text":"Once you submit your PR, a team member will review your proposal. We may ask questions, request additional information, or ask for changes to be made before a PR can be merged, either using suggested changes or pull request comments. You can apply the changes directly through the UI (check the files changed and click on the right-corner three dots; see image below) or from your fork, and then commit them to your branch. The PR will be updated automatically, and the suggestions will appear as outdated . 
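For instance, applying the requested changes from your local clone and updating the PR could look like this (the branch name and file are placeholders): git checkout [branch-name]\n\n# Edit the files following the review comments, then stage and commit them\ngit add [modified-file]\ngit commit -m \"fix: apply review suggestions\"\n\n# Push to your fork so the PR is updated\ngit push origin [branch-name]\n 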
If you run into any merge issues, check out this git tutorial to help you resolve merge conflicts and other issues. "},{"location":"sections/community/contributor/#your-pr-is-merged","title":"Your PR is merged!","text":"Congratulations \ud83c\udf89\ud83c\udf8a We thank you \ud83e\udd29 Once your PR is merged, your contributions will be publicly visible on the distilabel GitHub. Additionally, we will include your changes in the next release based on our development branch. "},{"location":"sections/community/contributor/#additional-resources","title":"Additional resources","text":"Here are some helpful resources for your reference. - Configuring Discord, a guide to learning how to get started with Discord.
- Pro Git, a book to learn Git.
- Git in VSCode, a guide to learning how to easily use Git in VSCode.
- GitHub Skills, an interactive course for learning GitHub.
"},{"location":"sections/community/developer_documentation/","title":"Developer Documentation","text":"Thank you for investing your time in contributing to the project! If you don't have the repository locally, and need any help, go to the contributor guide and read the contributor workflow with Git and GitHub first. "},{"location":"sections/community/developer_documentation/#set-up-the-python-environment","title":"Set up the Python environment","text":"To work on the distilabel , you must install the package on your system. Tip This guide will use uv , but pip and venv can be used as well, this guide can work quite similar with both options. From the root of the cloned Distilabel repository, you should move to the distilabel folder in your terminal. cd distilabel\n "},{"location":"sections/community/developer_documentation/#create-a-virtual-environment","title":"Create a virtual environment","text":"The first step will be creating a virtual environment to keep our dependencies isolated. Here we are choosing python 3.11 (uv venv documentation), and then activate it: uv venv .venv --python 3.11\nsource .venv/bin/activate\n "},{"location":"sections/community/developer_documentation/#install-the-project","title":"Install the project","text":"Installing from local (we are using uv pip ): uv pip install -e .\n We have extra dependencies with their name, depending on the part you are working on, you may want to install some dependency (take a look at pyproject.toml in the repo to see all the extra dependencies): uv pip install -e \".[vllm,outlines]\"\n "},{"location":"sections/community/developer_documentation/#linting-and-formatting","title":"Linting and formatting","text":"To maintain a consistent code format, install the pre-commit hooks to run before each commit automatically (we rely heavily on ruff ): uv pip install -e \".[dev]\"\npre-commit install\n "},{"location":"sections/community/developer_documentation/#running-tests","title":"Running tests","text":"All the changes you add to the codebase should come with tests, either unit or integration tests, depending on the type of change, which are placed under tests/unit and tests/integration respectively. Start by installing the tests dependencies: uv pip install \".[tests]\"\n Running the whole tests suite may take some time, and you will need all the dependencies installed, so just run your tests, and the whole tests suite will be run for you in the CI: # Run specific tests\npytest tests/unit/steps/generators/test_data.py\n "},{"location":"sections/community/developer_documentation/#set-up-the-documentation","title":"Set up the documentation","text":"To contribute to the documentation and generate it locally, ensure you have installed the development dependencies: uv pip install -e \".[docs]\"\n And run the following command to create the development server with mkdocs : mkdocs serve\n "},{"location":"sections/community/developer_documentation/#documentation-guidelines","title":"Documentation guidelines","text":"As mentioned, we use mkdocs to build the documentation. You can write the documentation in markdown format, and it will automatically be converted to HTML. In addition, you can include elements such as tables, tabs, images, and others, as shown in this guide. We recommend following these guidelines: -
Use clear and concise language: Ensure the documentation is easy to understand for all users by using straightforward language and including meaningful examples. Images are not easy to maintain, so use them only when necessary and place them in the appropriate folder within the docs/assets/images directory. -
Verify code snippets: Double-check that all code snippets are correct and runnable. -
Review spelling and grammar: Check the spelling and grammar of the documentation. -
Update the table of contents: If you add a new page, include it in the relevant index.md or the mkdocs.yml file. "},{"location":"sections/community/developer_documentation/#components-gallery","title":"Components gallery","text":"The components gallery section of the documentation is automatically generated thanks to a custom plugin, it will be run when mkdocs serve is called. This guide to the steps helps us visualize each step, as well as examples of use. Note Changes done to the docstrings of Steps/Tasks/LLMs won't appear in the components gallery automatically, you will have to stop the mkdocs server and run it again to see the changes, everything else is reloaded automatically. "},{"location":"sections/community/popular_issues/","title":"Issue dashboard","text":"Most engaging open issuesLatest issues open by the communityPlanned issues for upcoming releases Rank Issue Reactions Comments 1 1041 - [FEATURE] Add Offline batch generation for open models with EXXA API \ud83d\udc4d 2 \ud83d\udcac 1 2 995 - [FEATURE] mlx-lm integration \ud83d\udc4d 2 \ud83d\udcac 1 3 737 - [FEATURE] Allow FormatTextGenerationSFT to include tools/function calls in the formatted messages. \ud83d\udc4d 2 \ud83d\udcac 0 4 1001 - [FEATURE] sglang integration \ud83d\udc4d 1 \ud83d\udcac 1 5 797 - [FEATURE] synthetic data generation for predictive NLP tasks \ud83d\udc4d 1 \ud83d\udcac 1 6 914 - [FEATURE] Use Step.resources to set tensor_parallel_size and pipeline_parallel_size in vLLM \ud83d\udc4d 1 \ud83d\udcac 0 7 588 - [FEATURE] Single request caching \ud83d\udc4d 1 \ud83d\udcac 0 8 953 - [EXAMPLE] Add CRAFT Your Dataset: Task-Specific Synthetic Dataset Generation Through Corpus Retrieval and Augmentation example \ud83d\udc4d 0 \ud83d\udcac 6 9 972 - [BUG] Input data size != output data size when task batch size < batch size of predecessor \ud83d\udc4d 0 \ud83d\udcac 4 10 859 - [FEATURE] Update PushToHub to stream data to the Hub \ud83d\udc4d 0 \ud83d\udcac 4 Rank Issue Author 1 \ud83d\udfe2 1070 - [BUG] Pipeline serialization/caching issue when including RoutingBatchFunction by liamcripwell 2 \ud83d\udfe2 1068 - [BUG] GenerateSentencePair(...) always returns None positive and negative pairs by caesar-one 3 \ud83d\udfe2 1064 - [DOCS] Update basic guides of steps and tasks by plaguss 4 \ud83d\udfe2 1058 - [FEATURE] Implement a rate limiter for API calls by plaguss 5 \ud83d\udfe3 1056 - [DOCS] The example on how to use a Step no longer works by wwymak 6 \ud83d\udfe3 1049 - [BUG] vLLM Task not utilizing multiple GPUs in parallel when replicas > 1 by adamlin120 7 \ud83d\udfe3 1048 - [BUG] OepnAI JSON format by tinyrolls 8 \ud83d\udfe2 1047 - Failed to load all the steps. Could not run pipeline. 
by yuqie 9 \ud83d\udfe2 1046 - [FEATURE] Compute the input/output tokens of a dataset by plaguss 10 \ud83d\udfe3 1044 - Receiving error: The number of required GPUs exceeds the total number of available GPUs in the placement group by saurabhbbjain Rank Issue Milestone 1 \ud83d\udfe2 579 - [FEATURE] Sequential execution for local pipeline 1.4.0 2 \ud83d\udfe2 771 - [FEATURE] Allow passing path to YAML file containing pipeline runtime parameters in distilabel run 1.4.0 3 \ud83d\udfe2 773 - [DOCS] Include section/guide describing pipeline patterns 1.4.0 4 \ud83d\udfe2 802 - [FEATURE] Add defaults to Steps and Tasks so they can be more easily connected 1.4.0 5 \ud83d\udfe2 880 - [FEATURE] Add exclude_from_signature attribute 1.4.0 6 \ud83d\udfe2 942 - [BUG] make_generator_step can fail when setting the _dataset_info internally 1.4.0 7 \ud83d\udfe2 662 - [FEATURE] Allow passing self to steps created with step decorator 1.4.0 8 \ud83d\udfe2 889 - [FEATURE] Replace extra_sampling_params for normal arguments in vLLM 1.4.0 9 \ud83d\udfe2 738 - [FEATURE] Update LLM.generate interface to allow returning arbitrary/extra stuff related to the generation 1.5.0 10 \ud83d\udfe2 749 - [IMPLEMENTATION] Self-play with Execution Feedback: Improving Instruction-following Capabilities of Large Language Models 1.5.0 Last update: 2024-12-18 "},{"location":"sections/getting_started/faq/","title":"Frequent Asked Questions (FAQ)","text":"How can I rename the columns in a batch? Every Step has both input_mappings and output_mappings attributes that can be used to rename the columns in each batch. But input_mappings will only map, meaning that if you have a batch with the column A and you want to rename it to B , you should use input_mappings={\"A\": \"B\"} , but that will only be applied to that specific Step meaning that the next step in the pipeline will still have the column A instead of B . While output_mappings will indeed apply the rename, meaning that if the Step produces the column A and you want to rename to B , you should use output_mappings={\"A\": \"B\"} , and that will be applied to the next Step in the pipeline. Will the API Keys be exposed when sharing the pipeline? No, those will be masked out using pydantic.SecretStr , meaning that those won't be exposed when sharing the pipeline. This also means that if you want to re-run your own pipeline and the API keys have not been provided via environment variable but either via an attribute or runtime parameter, you will need to provide them again. Does it work for Windows? Yes, but you may need to set the multiprocessing context in advance to ensure that the spawn method is used since the default method fork is not available on Windows. import multiprocessing as mp\n\nmp.set_start_method(\"spawn\")\n Will the custom Steps / Tasks / LLMs be serialized too? No, at the moment, only the references to the classes within the distilabel library will be serialized, meaning that if you define a custom class used within the pipeline, the serialization won't break, but the deserialize will fail since the class won't be available unless used from the same file. What happens if Pipeline.run fails? Do I lose all the data? No, indeed, we're using a cache mechanism to store all the intermediate results in the disk so, if a Step fails; the pipeline can be re-run from that point without losing the data, only if nothing is changed in the Pipeline . 
All the data will be stored in .cache/distilabel , but the only data that will persist at the end of the Pipeline.run execution is the one from the leaf step/s, so bear that in mind. For more information on the caching mechanism in distilabel , you can check the Learn - Advanced - Caching section. Also, note that when running a Step or a Task standalone, the cache mechanism won't be used, so if you want to use that, you should use the Pipeline context manager. How can I use the same LLM across several tasks without having to load it several times? You can serve the LLM using a solution like TGI or vLLM, and then connect to it using an AsyncLLM client like InferenceEndpointsLLM or OpenAILLM . Please refer to Serving LLMs guide for more information. Can distilabel be used with OpenAI Batch API? Yes, distilabel is integrated with OpenAI Batch API via OpenAILLM. Check LLMs - Offline Batch Generation for a small example on how to use it and Advanced - Offline Batch Generation for a more detailed guide. Prevent overloads on Free Serverless Endpoints When running a task using the InferenceEndpointsLLM with Free Serverless Endpoints, you may be facing some errors such as Model is overloaded if you let the batch size to the default (set at 50). To fix the issue, lower the value or even better set input_batch_size=1 in your task. It may take a longer time to finish, but please remember this is a free service. from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import TextGeneration\n\nTextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=1\n)\n "},{"location":"sections/getting_started/installation/","title":"Installation","text":"You will need to have at least Python 3.9 or higher, up to Python 3.12, since support for the latter is still a work in progress. To install the latest release of the package from PyPI you can use the following command: pip install distilabel --upgrade\n Alternatively, you may also want to install it from source i.e. the latest unreleased version, you can use the following command: pip install \"distilabel @ git+https://github.com/argilla-io/distilabel.git@develop\" --upgrade\n Note We are installing from develop since that's the branch we use to collect all the features, bug fixes, and improvements that will be part of the next release. If you want to install from a specific branch, you can replace develop with the branch name. "},{"location":"sections/getting_started/installation/#extras","title":"Extras","text":"Additionally, as part of distilabel some extra dependencies are available, mainly to add support for some of the LLM integrations we support. Here's a list of the available extras: "},{"location":"sections/getting_started/installation/#llms","title":"LLMs","text":" -
anthropic : for using models available in Anthropic API via the AnthropicLLM integration. -
argilla : for exporting the generated datasets to Argilla. -
cohere : for using models available in Cohere via the CohereLLM integration. -
groq : for using models available in Groq using groq Python client via the GroqLLM integration. -
hf-inference-endpoints : for using the Hugging Face Inference Endpoints via the InferenceEndpointsLLM integration. -
hf-transformers : for using models available in transformers package via the TransformersLLM integration. -
litellm : for using LiteLLM to call any LLM using OpenAI format via the LiteLLM integration. -
llama-cpp : for using llama-cpp-python Python bindings for llama.cpp via the LlamaCppLLM integration. -
mistralai : for using models available in Mistral AI API via the MistralAILLM integration. -
ollama : for using Ollama and their available models via OllamaLLM integration. -
openai : for using OpenAI API models via the OpenAILLM integration, or the rest of the integrations based on OpenAI and relying on its client as AnyscaleLLM , AzureOpenAILLM , and TogetherLLM . -
vertexai : for using Google Vertex AI proprietary models via the VertexAILLM integration. -
vllm : for using vllm serving engine via the vLLM integration. -
sentence-transformers : for generating sentence embeddings using sentence-transformers. "},{"location":"sections/getting_started/installation/#data-processing","title":"Data processing","text":" -
ray : for scaling and distributing a pipeline with Ray. -
faiss-cpu and faiss-gpu : for generating sentence embeddings using faiss. -
minhash : for using minhash for duplicate detection with datasketch and nltk. -
text-clustering : for using text clustering with UMAP and Scikit-learn. "},{"location":"sections/getting_started/installation/#structured-generation","title":"Structured generation","text":""},{"location":"sections/getting_started/installation/#recommendations-notes","title":"Recommendations / Notes","text":"The mistralai dependency requires Python 3.9 or higher, so if you're willing to use the distilabel.models.llms.MistralLLM implementation, you will need to have Python 3.9 or higher. In some cases like transformers and vllm , the installation of flash-attn is recommended if you are using a GPU accelerator since it will speed up the inference process, but the installation needs to be done separately, as it's not included in the distilabel dependencies. pip install flash-attn --no-build-isolation\n Also, if you are willing to use the llama-cpp-python integration for running local LLMs, note that the installation process may get a bit trickier depending on which OS are you using, so we recommend you to read through their Installation section in their docs. "},{"location":"sections/getting_started/quickstart/","title":"Quickstart","text":""},{"location":"sections/getting_started/quickstart/#quickstart","title":"Quickstart","text":"Distilabel provides all the tools you need to your scalable and reliable pipelines for synthetic data generation and AI-feedback. Pipelines are used to generate data, evaluate models, manipulate data, or any other general task. They are made up of different components: Steps, Tasks and LLMs, which are chained together in a directed acyclic graph (DAG). - Steps: These are the building blocks of your pipeline. Normal steps are used for basic executions like loading data, applying some transformations, or any other general task.
- Tasks: These are steps that rely on LLMs and prompts to perform generative tasks. For example, they can be used to generate data, evaluate models or manipulate data.
- LLMs: These are the models that will perform the task. They can be local or remote models, and open-source or commercial models. A minimal sketch of how these components are chained together is shown right after this list.
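To make the DAG idea concrete, here is a minimal, hedged sketch (the step names, the tiny in-memory dataset and the chosen model are illustrative assumptions, not a prescribed setup) showing how a single data-loading step can feed two tasks in parallel using the >> operator: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(name=\"dag-sketch\") as pipeline:\n    # a generator step providing the input rows\n    load_data = LoadDataFromDicts(data=[{\"instruction\": \"Say hello.\"}])\n\n    # two tasks consuming the same upstream step, forming a small DAG\n    text_generation_a = TextGeneration(\n        llm=InferenceEndpointsLLM(model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"),\n    )\n    text_generation_b = TextGeneration(\n        llm=InferenceEndpointsLLM(model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"),\n    )\n\n    load_data >> [text_generation_a, text_generation_b]\n Connecting one step to a list of steps is what turns the pipeline into a graph rather than a plain chain; a fully runnable, linear version of this pattern is shown in the custom pipeline example below.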
Pipelines are designed to be scalable and reliable. They can be executed in a distributed manner, and they can be cached and recovered. This is useful when dealing with large datasets or when you want to ensure that your pipeline is reproducible. Besides that, pipelines are designed to be modular and flexible. You can easily add new steps, tasks, or LLMs to your pipeline, and you can also easily modify or remove them. An example architecture of a pipeline to generate a dataset of preferences is the following: "},{"location":"sections/getting_started/quickstart/#installation","title":"Installation","text":"To install the latest release with hf-inference-endpoints extra of the package from PyPI you can use the following command: pip install distilabel[hf-inference-endpoints] --upgrade\n "},{"location":"sections/getting_started/quickstart/#use-a-generic-pipeline","title":"Use a generic pipeline","text":"To use a generic pipeline for an ML task, you can use the InstructionResponsePipeline class. This class is a generic pipeline that can be used to generate data for supervised fine-tuning tasks. It uses the InferenceEndpointsLLM class to generate data based on the input data and the model. from distilabel.pipeline import InstructionResponsePipeline\n\npipeline = InstructionResponsePipeline()\ndataset = pipeline.run()\n The InstructionResponsePipeline class will use the InferenceEndpointsLLM class with the model meta-llama/Meta-Llama-3.1-8B-Instruct to generate data based on the system prompt. The output data will be a dataset with the columns instruction and response . The class uses a generic system prompt, but you can customize it by passing the system_prompt parameter to the class. Note We're actively working on building more pipelines for different tasks. If you have any suggestions or requests, please let us know! We're currently working on pipelines for classification, Direct Preference Optimization, and Information Retrieval tasks. "},{"location":"sections/getting_started/quickstart/#define-a-custom-pipeline","title":"Define a Custom pipeline","text":"In this guide we will walk you through the process of creating a simple pipeline that uses the InferenceEndpointsLLM class to generate text. The Pipeline will load a dataset that contains a column named prompt from the Hugging Face Hub via the step LoadDataFromHub and then use the InferenceEndpointsLLM class to generate text based on the dataset using the TextGeneration task. You can check the available models in the Hugging Face Model Hub and filter by Inference status . 
from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline( # (1)\n name=\"simple-text-generation-pipeline\",\n description=\"A simple text generation pipeline\",\n) as pipeline: # (2)\n load_dataset = LoadDataFromHub( # (3)\n output_mappings={\"prompt\": \"instruction\"},\n )\n\n text_generation = TextGeneration( # (4)\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n ), # (5)\n system_prompt=\"You are a creative AI Assistant writer.\",\n template=\"Follow the following instruction: {{ instruction }}\" # (6)\n )\n\n load_dataset >> text_generation # (7)\n\nif __name__ == \"__main__\":\n distiset = pipeline.run( # (8)\n parameters={\n load_dataset.name: {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n text_generation.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n },\n )\n distiset.push_to_hub(repo_id=\"distilabel-example\") # (9)\n -
We define a Pipeline with the name simple-text-generation-pipeline and a description A simple text generation pipeline . Note that the name is mandatory and will be used to calculate the cache signature path, so changing the name will change the cache path and the pipeline will be identified as a different one. -
We are using the Pipeline context manager, meaning that every Step subclass that is defined within the context manager will be added to the pipeline automatically. -
We define a LoadDataFromHub step named load_dataset that will load a dataset from the Hugging Face Hub, as provided via runtime parameters in the pipeline.run method below, but it can also be defined within the class instance via the arg repo_id=... . This step will produce output batches with the rows from the dataset, and the column prompt will be mapped to the instruction field. -
We define a TextGeneration task named text_generation that will generate text based on the instruction field from the dataset. This task will use the InferenceEndpointsLLM class with the model Meta-Llama-3.1-8B-Instruct . -
We define the InferenceEndpointsLLM class with the model Meta-Llama-3.1-8B-Instruct that will be used by the TextGeneration task. In this case, since the InferenceEndpointsLLM is used, we assume that the HF_TOKEN environment variable is set. -
Both system_prompt and template are optional fields. The template must be provided as a string following the Jinja2 template format, and the fields that appear in it (\"instruction\" in this case, which corresponds to the default) must be listed in the columns attribute. The component gallery for TextGeneration has examples to get you started. -
We connect the load_dataset step to the text_generation task using the rshift operator, meaning that the output from the load_dataset step will be used as input for the text_generation task. -
We run the pipeline with the parameters for the load_dataset and text_generation steps. The load_dataset step will use the repository distilabel-internal-testing/instruction-dataset-mini and the test split, and the text_generation task will use the generation_kwargs with the temperature set to 0.7 and the max_new_tokens set to 512 . -
Optionally, we can push the generated Distiset to the Hugging Face Hub repository distilabel-example . This will allow you to share the generated dataset with others and use it in other pipelines. "},{"location":"sections/how_to_guides/","title":"How-to guides","text":"Welcome to the how-to guides section! Here you will find a collection of guides that will help you get started with Distilabel. We have divided the guides into two categories: basic and advanced. The basic guides will help you get started with the core concepts of Distilabel, while the advanced guides will help you explore more advanced features. "},{"location":"sections/how_to_guides/#basic","title":"Basic","text":" -
Define Steps for your Pipeline Steps are the building blocks of your pipeline. They can be used to generate data, evaluate models, manipulate data, or any other general task. Define Steps -
Define Tasks that rely on LLMs Tasks are a specific type of step that rely on Language Models (LLMs) to generate data. Define Tasks -
Define LLMs as local or remote models LLMs are the core of your tasks. They are used to integrate with local models or remote APIs. Define LLMs -
Execute Steps and Tasks in a Pipeline Pipeline is where you put all your steps and tasks together to create a workflow. Execute Pipeline "},{"location":"sections/how_to_guides/#advanced","title":"Advanced","text":" -
Using the Distiset dataset object Distiset is a dataset object based on the datasets library that can be used to store and manipulate data. Distiset -
Export data to Argilla Argilla is a platform that can be used to store, search, and apply feedback to datasets. Argilla -
Using a file system to pass data of batches between steps File system can be used to pass data between steps in a pipeline. File System -
Using CLI to explore and re-run existing Pipelines CLI can be used to explore and re-run existing pipelines through the command line. CLI -
Cache and recover pipeline executions Caching can be used to recover pipeline executions to avoid losing data and precious LLM calls. Caching -
Structured data generation Structured data generation can be used to generate data with a specific structure like JSON, function calls, etc. Structured Generation -
Serving an LLM for sharing it between several tasks Serve an LLM via TGI or vLLM to make requests and connect using a client like InferenceEndpointsLLM or OpenAILLM to avoid wasting resources. Sharing an LLM across tasks -
Impose requirements to your pipelines and steps Add requirements to steps in a pipeline to ensure they are installed and avoid errors. Pipeline requirements "},{"location":"sections/how_to_guides/advanced/argilla/","title":"Export data to Argilla","text":"Being able to export the generated synthetic datasets to Argilla, is a core feature within distilabel . We believe in the potential of synthetic data, but without removing the impact a human annotator or group of annotators can bring. So on, the Argilla integration makes it straightforward to push a dataset to Argilla while the Pipeline is running, to be able to follow along the generation process in Argilla's UI, as well as annotating the records on the fly. One can include a Step within the Pipeline to easily export the datasets to Argilla with a pre-defined configuration, suiting the annotation purposes. Before using any of the steps about to be described below, you should first have an Argilla instance up and running, so that you can successfully upload the data to Argilla. In order to deploy Argilla, the easiest and most straightforward way is to deploy it via the Argilla Template in Hugging Face Spaces as simply as following the steps there, or just via the following button: "},{"location":"sections/how_to_guides/advanced/argilla/#text-generation","title":"Text Generation","text":"For text generation scenarios, i.e. when the Pipeline contains a single TextGeneration step, we have designed the task TextGenerationToArgilla , which will seamlessly push the generated data to Argilla, and allow the annotator to review the records. The dataset will be pushed with the following configuration: -
Fields: instruction and generation , both being fields of type argilla.TextField , plus the automatically generated id for the given instruction to be able to search for other records with the same instruction in the dataset. The field instruction must always be a string, while the field generation can either be a single string or a list of strings (useful when there are multiple parent nodes of type TextGeneration ), even though each record will always contain at most one instruction -generation pair. -
Questions: quality will be the only question for the annotators to answer, i.e., to annotate, and it will be an argilla.LabelQuestion referring to the quality of the provided generation for the given instruction. It can be annotated as either \ud83d\udc4e (bad) or \ud83d\udc4d (good). Note The TextGenerationToArgilla step will only work as is if the Pipeline contains one or multiple TextGeneration steps, or if the columns instruction and generation are available within the batch data. Otherwise, the variable input_mappings will need to be set so that either both or one of instruction and generation are mapped to one of the existing columns in the batch data. from distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts, TextGenerationToArgilla\nfrom distilabel.steps.tasks import TextGeneration\n\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n load_dataset = LoadDataFromDicts(\n name=\"load_dataset\",\n data=[\n {\n \"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\",\n },\n ],\n )\n\n text_generation = TextGeneration(\n name=\"text_generation\",\n llm=OpenAILLM(model=\"gpt-4\"),\n )\n\n to_argilla = TextGenerationToArgilla(\n dataset_name=\"my-dataset\",\n dataset_workspace=\"admin\",\n api_url=\"<ARGILLA_API_URL>\",\n api_key=\"<ARGILLA_API_KEY>\",\n )\n\n load_dataset >> text_generation >> to_argilla\n\npipeline.run()\n "},{"location":"sections/how_to_guides/advanced/argilla/#preference","title":"Preference","text":"For preference scenarios, i.e. when the Pipeline contains multiple TextGeneration steps, we have designed the task PreferenceToArgilla , which will seamlessly push the generated data to Argilla, and allow the annotator to review the records. The dataset will be pushed with the following configuration: -
Fields: instruction and generations , both being fields of type argilla.TextField , plus the automatically generated id for the given instruction to be able to search for other records with the same instruction in the dataset. The field instruction must always be a string, while the field generations must be a list of strings, containing the generated texts for the given instruction so that there are at least two generations to compare. Other than that, the number of generation fields within each record in Argilla will be defined by the value of the variable num_generations to be provided in the PreferenceToArgilla step. -
Questions: rating and rationale will be the pairs of questions to be defined per each generation i.e. per each value within the range from 0 to num_generations , and those will be of types argilla.RatingQuestion and argilla.TextQuestion , respectively. Note that only the first pair of questions will be mandatory, since only one generation is ensured to be within the batch data. Additionally, note that the provided ratings will range from 1 to 5, and to mention that Argilla only supports values above 0. Note The PreferenceToArgilla step will only work if the Pipeline contains multiple TextGeneration steps, or if the columns instruction and generations are available within the batch data. Otherwise, the variable input_mappings will need to be set so that either both or one of instruction and generations are mapped to one of the existing columns in the batch data. Note Additionally, if the Pipeline contains an UltraFeedback step, the ratings and rationales will also be available and be automatically injected as suggestions to the existing dataset. from distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts, PreferenceToArgilla\nfrom distilabel.steps.tasks import TextGeneration\n\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n load_dataset = LoadDataFromDicts(\n name=\"load_dataset\",\n data=[\n {\n \"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\",\n },\n ],\n )\n\n text_generation = TextGeneration(\n name=\"text_generation\",\n llm=OpenAILLM(model=\"gpt-4\"),\n num_generations=4,\n group_generations=True,\n )\n\n to_argilla = PreferenceToArgilla(\n dataset_name=\"my-dataset\",\n dataset_workspace=\"admin\",\n api_url=\"<ARGILLA_API_URL>\",\n api_key=\"<ARGILLA_API_KEY>\",\n num_generations=4,\n )\n\n load_dataset >> text_generation >> to_argilla\n\nif __name__ == \"__main__\":\n pipeline.run()\n "},{"location":"sections/how_to_guides/advanced/assigning_resources_to_step/","title":"Assigning resources to a Step ","text":"When dealing with complex pipelines that get executed in a distributed environment with abundant resources (CPUs and GPUs), sometimes it's necessary to allocate these resources judiciously among the Step s. This is why distilabel allows to specify the number of replicas , cpus and gpus for each Step . Let's see that with an example: from distilabel.pipeline import Pipeline\nfrom distilabel.models import vLLM\nfrom distilabel.steps import StepResources\nfrom distilabel.steps.tasks import PrometheusEval\n\n\nwith Pipeline(name=\"resources\") as pipeline:\n ...\n\n prometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]['content'] }}\\\\n{{ messages[1]['content'] }}[/INST]\",\n ),\n resources=StepResources(replicas=2, cpus=1, gpus=1)\n mode=\"absolute\",\n rubric=\"factual-validity\",\n reference=False,\n num_generations=1,\n group_generations=False,\n )\n In the example above, we're creating a PrometheusEval task (remember that Task s are Step s) that will use vLLM to serve prometheus-eval/prometheus-7b-v2.0 model. This task is resource intensive as it requires an LLM, which in turn requires a GPU to run fast. With that in mind, we have specified the resources required for the task using the StepResources class, and we have defined that we need 1 GPU and 1 CPU per replica of the task. In addition, we have defined that we need 2 replicas i.e. we will run two instances of the task so the computation for the whole dataset runs faster. 
In addition, StepResources uses the RuntimeParametersMixin, so we can also specify the resources for each step when running the pipeline: ...\n\nif __name__ == \"__main__\":\n pipeline.run(\n parameters={\n prometheus.name: {\"resources\": {\"replicas\": 2, \"cpus\": 1, \"gpus\": 1}}\n }\n )\n And that's it! When running the pipeline, distilabel will create the tasks in nodes that have available the specified resources. "},{"location":"sections/how_to_guides/advanced/caching/","title":"Pipeline cache","text":"distilabel will automatically save all the intermediate outputs generated by each Step of a Pipeline , so these outputs can be reused to recover the state of a pipeline execution that was stopped before finishing or to not have to re-execute steps from a pipeline after adding a new downstream step. "},{"location":"sections/how_to_guides/advanced/caching/#how-to-enabledisable-the-cache","title":"How to enable/disable the cache","text":"The use of the cache can be toggled using the use_cache parameter of the Pipeline.use_cache method. If True , then distilabel will use the reuse the outputs of previous executions for the new execution. If False , then distilabel will re-execute all the steps of the pipeline to generate new outputs for all the steps. with Pipeline(name=\"my-pipeline\") as pipeline:\n ...\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False) # (1)\n - Pipeline cache is disabled
In addition, the cache can be enabled/disabled at Step level using its use_cache attribute. If True , then the outputs of the step will be reused in the new pipeline execution. If False , then the step will be re-executed to generate new outputs. If the cache of one step is disabled and the outputs have to be regenerated, then the outputs of the steps that depend on this step will also be regenerated. with Pipeline(name=\"writting-assistant\") as pipeline:\n load_data = LoadDataFromDicts(\n data=[\n {\n \"instruction\": \"How much is 2+2?\"\n }\n ]\n )\n\n generation = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"Qwen/Qwen2.5-72B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.8,\n \"max_new_tokens\": 512,\n },\n ),\n use_cache=False # (1)\n )\n\n load_data >> generation\n\nif __name__ == \"__main__\":\n distiset = pipeline.run()\n - Step cache is disabled and every time the pipeline is executed, this step will be re-executed
"},{"location":"sections/how_to_guides/advanced/caching/#how-a-cache-hit-is-triggered","title":"How a cache hit is triggered","text":"distilabel groups information and data generated by a Pipeline using the name of the pipeline, so the first factor that triggers a cache hit is the name of the pipeline. The second factor, is the Pipeline.signature property. This property returns a hash that is generated using the names of the steps used in the pipeline and their connections. The third factor, is the Pipeline.aggregated_steps_signature property which is used to determine if the new pipeline execution is exactly the same as one of the previous i.e. the pipeline contains exactly the same steps, with exactly the same connections and the steps are using exactly the same parameters. If these three factors are met, then the cache hit is triggered and the pipeline won't get re-executed and instead the function create_distiset will be used to create the resulting Distiset using the outputs of the previous execution, as it can be seen in the following image: If the new pipeline execution have a different Pipeline.aggregated_steps_signature i.e. at least one step has changed its parameters, distilabel will reuse the outputs of the steps that have not changed and re-execute the steps that have changed, as it can be seen in the following image: The same pipeline from above gets executed a third time, but this time the last step text_generation_1 changed, so it's needed to re-execute it. The other steps, as they have not been changed, doesn't need to be re-executed and their outputs are reused. "},{"location":"sections/how_to_guides/advanced/distiset/","title":"Using the Distiset dataset object","text":"A Pipeline in distilabel returns a special type of Hugging Face datasets.DatasetDict which is called Distiset . The Distiset is a dictionary-like object that contains the different configurations generated by the Pipeline , where each configuration corresponds to each leaf step in the DAG built by the Pipeline . Each configuration corresponds to a different subset of the dataset. This is a concept taken from \ud83e\udd17 datasets that lets you upload different configurations of the same dataset within the same repository and can contain different columns i.e. different configurations, which can be seamlessly pushed to the Hugging Face Hub. Below you can find an example of how to create a Distiset object that resembles a datasets.DatasetDict : from datasets import Dataset\nfrom distilabel.distiset import Distiset\n\ndistiset = Distiset(\n {\n \"leaf_step_1\": Dataset.from_dict({\"instruction\": [1, 2, 3]}),\n \"leaf_step_2\": Dataset.from_dict(\n {\"instruction\": [1, 2, 3, 4], \"generation\": [5, 6, 7, 8]}\n ),\n }\n)\n Note If there's only one leaf node, i.e., only one step at the end of the Pipeline , then the configuration name won't be the name of the last step, but it will be set to \"default\" instead, as that's more aligned with standard datasets within the Hugging Face Hub. "},{"location":"sections/how_to_guides/advanced/distiset/#distiset-methods","title":"Distiset methods","text":"We can interact with the different pieces generated by the Pipeline and treat them as different configurations . The Distiset contains just two methods: "},{"location":"sections/how_to_guides/advanced/distiset/#traintest-split","title":"Train/Test split","text":"Create a train/test split partition of the dataset for the different configurations or subsets. 
>>> distiset.train_test_split(train_size=0.9)\nDistiset({\n leaf_step_1: DatasetDict({\n train: Dataset({\n features: ['instruction'],\n num_rows: 2\n })\n test: Dataset({\n features: ['instruction'],\n num_rows: 1\n })\n })\n leaf_step_2: DatasetDict({\n train: Dataset({\n features: ['instruction', 'generation'],\n num_rows: 3\n })\n test: Dataset({\n features: ['instruction', 'generation'],\n num_rows: 1\n })\n })\n})\n "},{"location":"sections/how_to_guides/advanced/distiset/#push-to-hugging-face-hub","title":"Push to Hugging Face Hub","text":"Push the Distiset to a Hugging Face repository, where each one of the subsets will correspond to a different configuration: distiset.push_to_hub(\n \"my-org/my-dataset\",\n commit_message=\"Initial commit\",\n private=False,\n token=os.getenv(\"HF_TOKEN\"),\n generate_card=True,\n include_script=False\n)\n New since version 1.3.0 Since version 1.3.0 you can automatically push the script that created your pipeline to the same repository. For example, assuming you have a file like the following: sample_pipe.pywith Pipeline() as pipe:\n ...\ndistiset = pipe.run()\ndistiset.push_to_hub(\n \"my-org/my-dataset,\n include_script=True\n)\n After running the command, you could visit the repository and the file sample_pipe.py will be stored to simplify sharing your pipeline with the community. "},{"location":"sections/how_to_guides/advanced/distiset/#custom-docstrings","title":"Custom Docstrings","text":"distilabel contains a custom plugin to automatically generates a gallery for the different components. The information is extracted by parsing the Step 's docstrings. You can take a look at the docstrings in the source code of the UltraFeedback, and take a look at the corresponding entry in the components gallery to see an example of how the docstrings are rendered. If you create your own components and want the Citations automatically rendered in the README card (in case you are sharing your final distiset in the Hugging Face Hub), you may want to add the citation section. This is an example for the MagpieGenerator Task: class MagpieGenerator(GeneratorTask, MagpieBase):\n r\"\"\"Generator task the generates instructions or conversations using Magpie.\n ...\n\n Citations:\n\n ```\n @misc{xu2024magpiealignmentdatasynthesis,\n title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n year={2024},\n eprint={2406.08464},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2406.08464},\n }\n ```\n \"\"\"\n The Citations section can include any number of bibtex references. To define them, you can add as much elements as needed just like in the example: each citation will be a block of the form: ```@misc{...}``` . This information will be automatically used in the README of your Distiset if you decide to call distiset.push_to_hub . Alternatively, if the Citations is not found, but in the References there are found any urls pointing to https://arxiv.org/ , we will try to obtain the Bibtex equivalent automatically. This way, Hugging Face can automatically track the paper for you and it's easier to find other datasets citing the same paper, or directly visiting the paper page. 
"},{"location":"sections/how_to_guides/advanced/distiset/#save-and-load-from-disk","title":"Save and load from disk","text":"Take into account that these methods work as datasets.load_from_disk and datasets.Dataset.save_to_disk so the arguments are directly passed to those methods. This means you can also make use of storage_options argument to save your Distiset in your cloud provider, including the distilabel artifacts (pipeline.yaml , pipeline.log and the README.md with the dataset card). You can read more in datasets documentation here. Save to diskLoad from disk (local)Load from disk (cloud) Save the Distiset to disk, and optionally (will be done by default) saves the dataset card, the pipeline config file and logs: distiset.save_to_disk(\n \"my-dataset\",\n save_card=True,\n save_pipeline_config=True,\n save_pipeline_log=True\n)\n Load a Distiset that was saved using Distiset.save_to_disk just the same way: distiset = Distiset.load_from_disk(\"my-dataset\")\n Load a Distiset from a remote location, like S3, GCS. You can pass the storage_options argument to authenticate with the cloud provider: distiset = Distiset.load_from_disk(\n \"s3://path/to/my_dataset\", # gcs:// or any filesystem tolerated by fsspec\n storage_options={\n \"key\": os.environ[\"S3_ACCESS_KEY\"],\n \"secret\": os.environ[\"S3_SECRET_KEY\"],\n ...\n }\n)\n Take a look at the remaining arguments at Distiset.save_to_disk and Distiset.load_from_disk . "},{"location":"sections/how_to_guides/advanced/distiset/#dataset-card","title":"Dataset card","text":"Having this special type of dataset comes with an added advantage when calling Distiset.push_to_hub , which is the automatically generated dataset card in the Hugging Face Hub. Note that it is enabled by default, but can be disabled by setting generate_card=False : distiset.push_to_hub(\"my-org/my-dataset\", generate_card=True)\n We will have an automatic dataset card (an example can be seen here) with some handy information like reproducing the Pipeline with the CLI , or examples of the records from the different subsets. "},{"location":"sections/how_to_guides/advanced/distiset/#create_distiset-helper","title":"create_distiset helper","text":"Lastly, we presented in the caching section the create_distiset function, you can take a look at the section to see how to create a Distiset from the cache folder, using the helper function to automatically include all the relevant data. "},{"location":"sections/how_to_guides/advanced/fs_to_pass_data/","title":"Using a file system to pass data of batches between steps","text":"In some situations, it can happen that the batches contains so much data that is faster to write it to disk and read it back in the next step, instead of passing it using the queue. To solve this issue, distilabel uses fsspec to allow providing a file system configuration and whether if this file system should be used to pass data between steps in the run method of the distilabel pipelines: Warning In order to use a specific file system/cloud storage, you will need to install the specific package providing the fsspec implementation for that file system. 
For instance, to use Google Cloud Storage you will need to install gcsfs : pip install gcsfs\n Check the available implementations: fsspec - Other known implementations from distilabel.pipeline import Pipeline\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n ...\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n ..., \n storage_parameters={\"path\": \"gcs://my-bucket\"},\n use_fs_to_pass_data=True\n )\n The code above setups a file system (in this case Google Cloud Storage) and sets the flag use_fs_to_pass_data to specify that the data of the batches should be passed to the steps using the file system. The storage_parameters argument is optional, and in the case it's not provided but use_fs_to_pass_data==True , distilabel will use the local file system. Note As GlobalStep s receives all the data from the previous steps in one single batch accumulating all the data, it's very likely that the data of the batch will be too big to be passed using the queue. In this case and even if use_fs_to_pass_data==False , distilabel will use the file system to pass the data to the GlobalStep . "},{"location":"sections/how_to_guides/advanced/load_groups_and_execution_stages/","title":"Load groups and execution stages","text":"By default, the distilabel architecture loads all steps of a pipeline at the same time, as they are all supposed to process batches of data in parallel. However, loading all steps at once can waste resources in two scenarios: when using GlobalStep s that must wait for upstream steps to complete before processing data, or when running on machines with limited resources that cannot execute all steps simultaneously. In these cases, steps need to be loaded and executed in distinct load stages. "},{"location":"sections/how_to_guides/advanced/load_groups_and_execution_stages/#load-stages","title":"Load stages","text":"A load stage represents a point in the pipeline execution where a group of steps are loaded at the same time to process batches in parallel. These stages are required because: - There are some kind of steps like the
GlobalStep s that need to receive all the data at once from their upstream steps, i.e. they need their upstream steps to have finished their execution. It would be wasteful to load a GlobalStep at the same time as other steps of the pipeline, as that would take resources (from the machine or cluster running the pipeline) that wouldn't be used until the upstream steps have finished. - When running on machines or clusters with limited resources, it may not be possible to load and execute all steps simultaneously, as they would need to access the same limited resources (memory, CPU, GPU, etc.).
Having that said, the first element that will create a load stage when executing a pipeline are the GlobalStep , as they mark and divide a pipeline in three stages: one stage with the upstream steps of the global step, one stage with the global step, and one final stage with the downstream steps of the global step. For example, the following pipeline will contain three stages: from typing import TYPE_CHECKING\n\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts, StepInput, step\n\nif TYPE_CHECKING:\n from distilabel.typing import StepOutput\n\n\n@step(inputs=[\"instruction\"], outputs=[\"instruction2\"])\ndef DummyStep(inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n input[\"instruction2\"] = \"miau\"\n yield inputs\n\n\n@step(inputs=[\"instruction\"], outputs=[\"instruction2\"], step_type=\"global\")\ndef GlobalDummyStep(inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n input[\"instruction2\"] = \"miau\"\n yield inputs\n\n\nwith Pipeline() as pipeline:\n generator = LoadDataFromDicts(data=[{\"instruction\": \"Hi\"}] * 50)\n dummy_step_0 = DummyStep()\n global_dummy_step = GlobalDummyStep()\n dummy_step_1 = DummyStep()\n\n generator >> dummy_step_0 >> global_dummy_step >> dummy_step_1\n\nif __name__ == \"__main__\":\n load_stages = pipeline.get_load_stages()\n\n for i, steps_stage in enumerate(load_stages[0]):\n print(f\"Stage {i}: {steps_stage}\")\n\n # Output:\n # Stage 0: ['load_data_from_dicts_0', 'dummy_step_0']\n # Stage 1: ['global_dummy_step_0']\n # Stage 2: ['dummy_step_1']\n As we can see, the GlobalStep divided the pipeline execution in three stages. "},{"location":"sections/how_to_guides/advanced/load_groups_and_execution_stages/#load-groups","title":"Load groups","text":"While GlobalStep s automatically divide pipeline execution into stages, we many need fine-grained control over how steps are loaded and executed within each stage. This is where load groups come in. Load groups allows to specify which steps of the pipeline have to be loaded together within a stage. This is particularly useful when running on resource-constrained environments where all the steps cannot be executed in parallel. Let's see how it works with an example: from datasets import load_dataset\n\nfrom distilabel.llms import vLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import StepResources\nfrom distilabel.steps.tasks import TextGeneration\n\ndataset = load_dataset(\n \"distilabel-internal-testing/instruction-dataset-mini\", split=\"test\"\n).rename_column(\"prompt\", \"instruction\")\n\nwith Pipeline() as pipeline:\n text_generation_0 = TextGeneration(\n llm=vLLM(\n model=\"HuggingFaceTB/SmolLM2-1.7B-Instruct\",\n extra_kwargs={\"max_model_len\": 1024},\n ),\n resources=StepResources(gpus=1),\n )\n\n text_generation_1 = TextGeneration(\n llm=vLLM(\n model=\"HuggingFaceTB/SmolLM2-1.7B-Instruct\",\n extra_kwargs={\"max_model_len\": 1024},\n ),\n resources=StepResources(gpus=1),\n )\n\nif __name__ == \"__main__\":\n load_stages = pipeline.get_load_stages(load_groups=[[text_generation_1.name]])\n\n for i, steps_stage in enumerate(load_stages[0]):\n print(f\"Stage {i}: {steps_stage}\")\n\n # Output:\n # Stage 0: ['text_generation_0']\n # Stage 1: ['text_generation_1']\n\n distiset = pipeline.run(dataset=dataset, load_groups=[[text_generation_0.name]])\n In this example, we're working with a machine that has a single GPU, but the pipeline includes two instances of TextGeneration tasks both using vLLM and requesting 1 GPU. 
We cannot execute both steps in parallel. To fix that, we use the load_groups argument of the run method to specify that the text_generation_0 step has to be executed in isolation, in its own stage. This way, we can run the pipeline on a single GPU machine by executing the steps in different stages (sequentially) instead of in parallel. Some key points about load groups: - Load groups are specified as a list of lists, where each inner list represents a group of steps that should be loaded together.
- Same as
GlobalStep s, load groups create a new load stage, dividing the pipeline in three stages: one for the upstream steps, one for the steps in the load group, and one for the downstream steps. "},{"location":"sections/how_to_guides/advanced/load_groups_and_execution_stages/#load-groups-modes","title":"Load groups modes","text":"In addition, distilabel allows passing some modes to the load_groups argument that will handle the creation of the load groups: \"sequential_step_execution\" : when passed, it will create a load group for each step, i.e. the execution of the steps of the pipeline will be sequential. "},{"location":"sections/how_to_guides/advanced/offline_batch_generation/","title":"Offline Batch Generation","text":"Offline batch generation is a feature that some LLM s implemented in distilabel offer, allowing you to send the inputs to an LLM-as-a-service platform and wait for the outputs in an asynchronous manner. LLM-as-a-service platforms offer this feature as it allows them to gather many inputs and create batches as big as the hardware allows, maximizing hardware utilization and reducing the cost of the service. In exchange, the user has to wait a certain amount of time for the outputs to be ready, but the cost per token is usually much lower. distilabel pipelines are able to handle LLM s that offer this feature in the following way: - The first time the pipeline gets executed, the
LLM will send the inputs to the platform. The platform will return job ids that can be used later to check the status of the jobs and retrieve the results. The LLM will save these job ids in its jobs_ids attribute and raise a special exception DistilabelOfflineBatchGenerationNotFinishedException that will be handled by the Pipeline . The job ids will be saved in the pipeline cache, so they can be used in subsequent calls. - The second time and subsequent calls will recover the pipeline execution and the
LLM won't send the inputs again to the platform. This time, as it has the jobs_ids , it will check if the jobs have finished, and if they have, it will retrieve the results and return the outputs. If they haven't finished, it will raise DistilabelOfflineBatchGenerationNotFinishedException again. - In addition, LLMs with offline batch generation can be configured to poll until the jobs have finished, blocking the pipeline until they are done. If for some reason the polling needs to be stopped, one can press Ctrl+C or Cmd+C depending on your OS (or send a
SIGINT to the main process) which will stop the polling and raise DistilabelOfflineBatchGenerationNotFinishedException that will be handled by the pipeline as described above. Warning In order to recover the pipeline execution and retrieve the results, the pipeline cache must be enabled. If the pipeline cache is disabled, then it will send the inputs again and create different jobs incurring in extra costs. "},{"location":"sections/how_to_guides/advanced/offline_batch_generation/#example-pipeline-using-openaillm-with-offline-batch-generation","title":"Example pipeline using OpenAILLM with offline batch generation","text":"from distilabel.models import OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline() as pipeline:\n load_data = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n text_generation = TextGeneration(\n llm=OpenAILLM(\n model=\"gpt-3.5-turbo\",\n use_offline_batch_generation=True, # (1)\n )\n )\n\n load_data >> text_generation\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n load_data.name: {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset\",\n \"split\": \"test\",\n \"batch_size\": 500,\n },\n }\n )\n - Indicate that the
OpenAILLM should use offline batch generation. "},{"location":"sections/how_to_guides/advanced/pipeline_requirements/","title":"Add requirements to run a Pipeline","text":"When sharing a Pipeline that contains custom Step s or Task s, you may want to add the specific requirements that are needed to run them. distilabel will take this list of requirements and warn the user if any are missing. Let's see how we can add additional requirements with an example. The first thing we're going to do is to add requirements for our CustomStep . To do so we use the requirements decorator to specify that the step has nltk as a dependency (version specifiers such as nltk>=3.8 can also be used). In addition, we're going to specify at Pipeline level that we need distilabel>=1.3.0 to run it. from typing import List\n\nimport nltk\n\nfrom distilabel.steps import Step\nfrom distilabel.steps.base import StepInput\nfrom distilabel.steps.typing import StepOutput\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.utils.requirements import requirements\nfrom distilabel.pipeline import Pipeline\n\n\n@requirements([\"nltk\"])\nclass CustomStep(Step):\n @property\n def inputs(self) -> List[str]:\n return [\"instruction\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"response\"]\n\n def process(self, inputs: StepInput) -> StepOutput: # type: ignore\n for input in inputs:\n input[\"response\"] = nltk.word_tokenize(input[\"instruction\"])\n yield inputs\n\n\nwith Pipeline(\n name=\"pipeline-with-requirements\", requirements=[\"distilabel>=1.3.0\"]\n) as pipeline:\n loader = LoadDataFromDicts(data=[{\"instruction\": \"sample sentence\"}])\n step1 = CustomStep()\n loader >> step1\n\nif __name__ == \"__main__\":\n pipeline.run()\n Once we call pipeline.run() , if any of the requirements specified at the Step or Pipeline level are missing, a ValueError will be raised telling us that we should install the missing dependencies: >>> pipeline.run()\n[06/27/24 11:07:33] ERROR ['distilabel.pipeline'] Please install the following requirements to run the pipeline: base.py:350\n distilabel>=1.3.0\n...\nValueError: Please install the following requirements to run the pipeline:\ndistilabel>=1.3.0\n "},{"location":"sections/how_to_guides/advanced/saving_step_generated_artifacts/","title":"Saving step generated artifacts","text":"Some Step s might need to produce an auxiliary artifact that is not a result of the computation, but is needed for the computation. For example, the FaissNearestNeighbour needs to create a Faiss index to compute the output of the step, which is the top k nearest neighbours for each input. Generating the Faiss index takes time and it could potentially be reused outside of the distilabel pipeline, so it would be a shame not to save it. For this reason, Step s have a method called save_artifact that allows saving artifacts that will be included along with the outputs of the pipeline in the generated Distiset . The generated artifacts will be uploaded or saved locally when using Distiset.push_to_hub or Distiset.save_to_disk , respectively. Let's see how to use it with a simple example. 
from typing import List, TYPE_CHECKING\nfrom distilabel.steps import GlobalStep, StepInput, StepOutput\nimport matplotlib.pyplot as plt\n\nif TYPE_CHECKING:\n from distilabel.steps import StepOutput\n\n\nclass CountTextCharacters(GlobalStep):\n @property\n def inputs(self) -> List[str]:\n return [\"text\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"text_character_count\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n character_counts = []\n\n for input in inputs:\n text_character_count = len(input[\"text\"])\n input[\"text_character_count\"] = text_character_count\n character_counts.append(text_character_count)\n\n # Generate plot with the distribution of text character counts\n plt.figure(figsize=(10, 6))\n plt.hist(character_counts, bins=30, edgecolor=\"black\")\n plt.title(\"Distribution of Text Character Counts\")\n plt.xlabel(\"Character Count\")\n plt.ylabel(\"Frequency\")\n\n # Save the plot as an artifact of the step\n self.save_artifact(\n name=\"text_character_count_distribution\",\n write_function=lambda path: plt.savefig(path / \"figure.png\"),\n metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n )\n\n plt.close()\n\n yield inputs\n As it can be seen in the example above, we have created a simple step that counts the number of characters in each input text and generates a histogram with the distribution of the character counts. We save the histogram as an artifact of the step using the save_artifact method. The method takes three arguments: name : The name we want to give to the artifact. write_function : A function that writes the artifact to the desired path. The function will receive a path argument which is a pathlib.Path object pointing to the directory where the artifact should be saved. metadata : A dictionary with metadata about the artifact. This metadata will be saved along with the artifact. 
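Before moving on to the full example, here is a complementary, minimal sketch showing that write_function simply receives a pathlib.Path pointing to a directory and can write any kind of file there, not only images (the step, artifact and file names below are illustrative assumptions, not part of this guide's example): import json\nfrom typing import List\n\nfrom distilabel.steps import GlobalStep, StepInput\nfrom distilabel.steps.typing import StepOutput\n\n\nclass SaveJsonSummary(GlobalStep): # illustrative step name\n @property\n def inputs(self) -> List[str]:\n return [\"text\"]\n\n @property\n def outputs(self) -> List[str]:\n return [] # no new columns are added, we only save an artifact\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n summary = {\"num_rows\": len(inputs)} # any JSON-serializable payload works\n # The lambda receives the directory where the artifact files should be written\n self.save_artifact(\n name=\"batch_summary\", # illustrative artifact name\n write_function=lambda path: (path / \"summary.json\").write_text(json.dumps(summary)),\n metadata={\"type\": \"json\"},\n )\n yield inputs\n The rest of this section continues with the CountTextCharacters step defined above.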
Let's execute the step with a simple pipeline and push the resulting Distiset to the Hugging Face Hub: Example full code from typing import TYPE_CHECKING, List\n\nimport matplotlib.pyplot as plt\nfrom datasets import load_dataset\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GlobalStep, StepInput, StepOutput\n\nif TYPE_CHECKING:\n from distilabel.steps import StepOutput\n\n\nclass CountTextCharacters(GlobalStep):\n @property\n def inputs(self) -> List[str]:\n return [\"text\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"text_character_count\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n character_counts = []\n\n for input in inputs:\n text_character_count = len(input[\"text\"])\n input[\"text_character_count\"] = text_character_count\n character_counts.append(text_character_count)\n\n # Generate plot with the distribution of text character counts\n plt.figure(figsize=(10, 6))\n plt.hist(character_counts, bins=30, edgecolor=\"black\")\n plt.title(\"Distribution of Text Character Counts\")\n plt.xlabel(\"Character Count\")\n plt.ylabel(\"Frequency\")\n\n # Save the plot as an artifact of the step\n self.save_artifact(\n name=\"text_character_count_distribution\",\n write_function=lambda path: plt.savefig(path / \"figure.png\"),\n metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n )\n\n plt.close()\n\n yield inputs\n\n\nwith Pipeline() as pipeline:\n count_text_characters = CountTextCharacters()\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n dataset=load_dataset(\n \"HuggingFaceH4/instruction-dataset\", split=\"test\"\n ).rename_column(\"prompt\", \"text\"),\n )\n\n distiset.push_to_hub(\"distilabel-internal-testing/distilabel-artifacts-example\")\n The generated distilabel-internal-testing/distilabel-artifacts-example dataset repository has a section in its card describing the artifacts generated by the pipeline and the generated plot can be seen here. "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/","title":"Scaling and distributing a pipeline with Ray","text":"Although the local Pipeline based on multiprocessing + serving LLMs with an external service is enough for executing most of the pipelines used to create SFT and preference datasets, there are scenarios where you might need to scale your pipeline across multiple machines. In such cases, distilabel leverages Ray to distribute the workload efficiently. This allows you to generate larger datasets, reduce execution time, and maximize resource utilization across a cluster of machines, without needing to change a single line of code. "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#relation-between-distilabel-steps-and-ray-actors","title":"Relation between distilabel steps and Ray Actors","text":"A distilabel pipeline consist of several Step s. An Step is a class that defines a basic life-cycle: - It will load or create the resources (LLMs, clients, etc) required to run its logic.
- It will run a loop, waiting for incoming batches received via an input queue. Once it receives a batch, it will process it and put the processed batch into an output queue.
- When it finishes processing the final batch or receives a special signal, the loop will finish and the unload logic will be executed. A minimal sketch of this life-cycle is shown below.
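The following is a minimal, illustrative sketch of that life-cycle using a custom Step (the step name, columns and method bodies are assumptions made here for illustration; the Step base class runs the actual batch loop and calls these load / process / unload hooks): from typing import List\n\nfrom distilabel.steps import Step, StepInput\nfrom distilabel.steps.typing import StepOutput\n\n\nclass LifecycleSketchStep(Step): # illustrative name\n @property\n def inputs(self) -> List[str]:\n return [\"instruction\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"instruction_length\"]\n\n def load(self) -> None:\n # 1. Load or create the resources (LLMs, clients, etc.) required by the step\n super().load()\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n # 2. Called for each incoming batch; the processed batch is sent to the output queue\n for input in inputs:\n input[\"instruction_length\"] = len(input[\"instruction\"])\n yield inputs\n\n def unload(self) -> None:\n # 3. Clean-up logic executed after the final batch or a stop signal\n super().unload()\n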
So a Step needs to maintain a minimal state, and the best way to do that with Ray is by using actors. graph TD\n A[Step] -->|has| B[Multiple Replicas]\n B -->|wrapped in| C[Ray Actor]\n C -->|maintains| D[Step Replica State]\n C -->|executes| E[Step Lifecycle]\n E -->|1. Load/Create Resources| F[LLMs, Clients, etc.]\n E -->|2. Process batches from| G[Input Queue]\n E -->|3. Processed batches are put in| H[Output Queue]\n E -->|4. Unload| I[Cleanup]\n "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#executing-a-pipeline-with-ray","title":"Executing a pipeline with Ray","text":"The recommended way to execute a distilabel pipeline using Ray is via the Ray Jobs API. Before jumping into the explanation, let's first install the prerequisites: pip install distilabel[ray]\n Tip It's recommended to create a virtual environment. For the purpose of explaining how to execute a pipeline with Ray, we'll use the following pipeline throughout the examples: from distilabel.models import vLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(name=\"text-generation-ray-pipeline\") as pipeline:\n load_data_from_hub = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n text_generation = TextGeneration(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n )\n )\n\n load_data_from_hub >> text_generation\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n load_data_from_hub.name: {\n \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n \"split\": \"test\",\n },\n text_generation.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096,\n }\n },\n \"resources\": {\"replicas\": 2, \"gpus\": 1}, # (1)\n },\n }\n )\n\n distiset.push_to_hub(\n \"<YOUR_HF_USERNAME_OR_ORGANIZATION>/text-generation-distilabel-ray\" # (2)\n )\n - We're setting resources for the
text_generation step, specifying that we want two replicas and one GPU per replica. distilabel will create two replicas of the step i.e. two actors in the Ray cluster, and each actor will request to be allocated on a node of the cluster that has at least one GPU. You can read more about how Ray manages the resources here. - You should modify this and add your user or organization on the Hugging Face Hub.
It's a basic pipeline with just two steps: one to load a dataset from the Hub with an instruction column and one to generate a response for that instruction using Llama 3 8B Instruct with vLLM. Simple but enough to demonstrate how to distribute and scale the workload using a Ray cluster! "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#using-ray-jobs-api","title":"Using Ray Jobs API","text":"If you don't know the Ray Jobs API then it's recommended to read the Ray Jobs Overview. Quick summary: Ray Jobs is the recommended way to execute a job in a Ray cluster as it will handle packaging, deploying and managing the Ray application. To execute the pipeline above, we first need to create a directory (kind of a package) with the pipeline script (or scripts) that we will submit to the Ray cluster: mkdir ray-pipeline\n The content of the directory ray-pipeline should be: ray-pipeline/\n\u251c\u2500\u2500 pipeline.py\n\u2514\u2500\u2500 runtime_env.yaml\n The first file contains the code of the pipeline, while the second one (runtime_env.yaml ) is a specific Ray file containing the environment dependencies required to run the job: pip:\n - distilabel[ray,vllm] >= 1.3.0\nenv_vars:\n HF_TOKEN: <YOUR_HF_TOKEN>\n With this file we're basically informing the Ray cluster that it will have to install distilabel with the vllm and ray extra dependencies to be able to run the job. In addition, we're defining the HF_TOKEN environment variable that will be used (by the push_to_hub method) to upload the resulting dataset to the Hugging Face Hub. After that, we can proceed to execute the ray command that will submit the job to the Ray cluster: ray job submit \\\n --address http://localhost:8265 \\\n --working-dir ray-pipeline \\\n --runtime-env ray-pipeline/runtime_env.yaml -- python pipeline.py\n This will basically upload the --working-dir to the Ray cluster, install the dependencies and then execute the python pipeline.py command from the head node. "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#file-system-requirements","title":"File system requirements","text":"As described in Using a file system to pass data to steps, distilabel relies on the file system to pass the data to the GlobalStep s, so if the pipeline to be executed in the Ray cluster has any GlobalStep or you want to set use_fs_to_pass_data=True in the run method, then you will need to set up a file system to which all the nodes of the Ray cluster have access: if __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={...},\n storage_parameters={\"path\": \"file:///mnt/data\"}, # (1)\n use_fs_to_pass_data=True,\n )\n - All the nodes of the Ray cluster should have access to
/mnt/data . "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#executing-a-raypipeline-in-a-cluster-with-slurm","title":"Executing a RayPipeline in a cluster with Slurm","text":"If you have access to an HPC, then you're probably also a user of Slurm, a workload manager typically used on HPCs. We can create Slurm job that takes some nodes and deploy a Ray cluster to run a distributed distilabel pipeline: #!/bin/bash\n#SBATCH --job-name=distilabel-ray-text-generation\n#SBATCH --partition=your-partition\n#SBATCH --qos=normal\n#SBATCH --nodes=2 # (1)\n#SBATCH --exclusive\n#SBATCH --ntasks-per-node=1 # (2)\n#SBATCH --gpus-per-node=1 # (3)\n#SBATCH --time=0:30:00\n\nset -ex\n\necho \"SLURM_JOB_ID: $SLURM_JOB_ID\"\necho \"SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST\"\n\n# Activate virtual environment\nsource /path/to/virtualenv/.venv/bin/activate\n\n# Getting the node names\nnodes=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\")\nnodes_array=($nodes)\n\n# Get the IP address of the head node\nhead_node=${nodes_array[0]}\nhead_node_ip=$(srun --nodes=1 --ntasks=1 -w \"$head_node\" hostname --ip-address)\n\n# Start Ray head node\nport=6379\nip_head=$head_node_ip:$port\nexport ip_head\necho \"IP Head: $ip_head\"\n\n# Generate a unique Ray tmp dir for the head node (just in case the default one is not writable)\nhead_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_head\"\n\necho \"Starting HEAD at $head_node\"\nOUTLINES_CACHE_DIR=\"/tmp/.outlines\" srun --nodes=1 --ntasks=1 -w \"$head_node\" \\ # (4)\n ray start --head --node-ip-address=\"$head_node_ip\" --port=$port \\\n --dashboard-host=0.0.0.0 \\\n --dashboard-port=8265 \\\n --temp-dir=\"$head_tmp_dir\" \\\n --block &\n\n# Give some time to head node to start...\necho \"Waiting a bit before starting worker nodes...\"\nsleep 10\n\n# Start Ray worker nodes\nworker_num=$((SLURM_JOB_NUM_NODES - 1))\n\n# Start from 1 (0 is head node)\nfor ((i = 1; i <= worker_num; i++)); do\n node_i=${nodes_array[$i]}\n worker_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_worker_$i\"\n echo \"Starting WORKER $i at $node_i\"\n OUTLINES_CACHE_DIR=\"/tmp/.outlines\" srun --nodes=1 --ntasks=1 -w \"$node_i\" \\\n ray start --address \"$ip_head\" \\\n --temp-dir=\"$worker_tmp_dir\" \\\n --block &\n sleep 5\ndone\n\n# Give some time to the Ray cluster to gather info\necho \"Waiting a bit before submitting the job...\"\nsleep 60\n\n# Finally submit the job to the cluster\nray job submit --address http://localhost:8265 --working-dir ray-pipeline -- python -u pipeline.py\n - In this case, we just want two nodes: one to run the Ray head node and one to run a worker.
- We just want to run a task per node i.e. the Ray command that starts the head/worker node.
- We have selected 1 GPU per node, but we could have selected more depending on the pipeline.
- We need to set the environment variable
OUTLINES_CACHE_DIR to /tmp/.outlines to avoid issues with the nodes trying to read/write the same outlines cache files, which is not possible. "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#vllm-and-tensor_parallel_size","title":"vLLM and tensor_parallel_size ","text":"In order to use vLLM multi-GPU and multi-node capabilities with ray , we need to do a few changes in the example pipeline from above. The first change needed is to specify a value for tensor_parallel_size aka \"In how many GPUs do I want you to load the model\", and the second one is to define ray as the distributed_executor_backend as the default one in vLLM is to use multiprocessing : with Pipeline(name=\"text-generation-ray-pipeline\") as pipeline:\n load_data_from_hub = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n text_generation = TextGeneration(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n extra_kwargs={\n \"tensor_parallel_size\": 8,\n \"distributed_executor_backend\": \"ray\",\n }\n )\n )\n\n load_data_from_hub >> text_generation\n More information about distributed inference with vLLM can be found here: vLLM - Distributed Serving "},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/","title":"Serving an LLM for sharing it between several Task s","text":"It's very common to want to use the same LLM for several Task s in a pipeline. To avoid loading the LLM as many times as the number of Task s and avoid wasting resources, it's recommended to serve the model using solutions like text-generation-inference or vLLM , and then use an AsyncLLM compatible client like InferenceEndpointsLLM or OpenAILLM to communicate with the server respectively. "},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/#serving-llms-using-text-generation-inference","title":"Serving LLMs using text-generation-inference ","text":"model=meta-llama/Meta-Llama-3-8B-Instruct\nvolume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run\n\ndocker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \\\n -e HUGGING_FACE_HUB_TOKEN=<secret> \\\n ghcr.io/huggingface/text-generation-inference:2.0.4 \\\n --model-id $model\n Note The bash command above has been copy-pasted from the official docs text-generation-inference. Please refer to the official docs for more information. 
And then we can use InferenceEndpointsLLM with base_url=http://localhost:8080 (pointing to our TGI local deployment): from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nwith Pipeline(name=\"serving-llm\") as pipeline:\n load_data = LoadDataFromDicts(\n data=[{\"instruction\": \"Write a poem about the sun and moon.\"}]\n )\n\n # `base_url` points to the address of the `TGI` serving the LLM\n llm = InferenceEndpointsLLM(base_url=\"http://192.168.1.138:8080\")\n\n text_generation = TextGeneration(\n llm=llm,\n num_generations=3,\n group_generations=True,\n output_mappings={\"generation\": \"generations\"},\n )\n\n ultrafeedback = UltraFeedback(aspect=\"overall-rating\", llm=llm)\n\n load_data >> text_generation >> ultrafeedback\n "},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/#serving-llms-using-vllm","title":"Serving LLMs using vLLM ","text":"docker run --gpus all \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n --env \"HUGGING_FACE_HUB_TOKEN=<secret>\" \\\n -p 8000:8000 \\\n --ipc=host \\\n vllm/vllm-openai:latest \\\n --model meta-llama/Meta-Llama-3-8B-Instruct\n Note The bash command above has been copy-pasted from the official docs vLLM. Please refer to the official docs for more information. And then we can use OpenAILLM with base_url=http://localhost:8000 (pointing to our vLLM local deployment): from distilabel.models import OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nwith Pipeline(name=\"serving-llm\") as pipeline:\n load_data = LoadDataFromDicts(\n data=[{\"instruction\": \"Write a poem about the sun and moon.\"}]\n )\n\n # `base_url` points to the address of the `vLLM` serving the LLM\n llm = OpenAILLM(base_url=\"http://192.168.1.138:8000\", model=\"\")\n\n text_generation = TextGeneration(\n llm=llm,\n num_generations=3,\n group_generations=True,\n output_mappings={\"generation\": \"generations\"},\n )\n\n ultrafeedback = UltraFeedback(aspect=\"overall-rating\", llm=llm)\n\n load_data >> text_generation >> ultrafeedback\n "},{"location":"sections/how_to_guides/advanced/structured_generation/","title":"Structured data generation","text":"Distilabel has integrations with relevant libraries to generate structured text i.e. to guide the LLM towards the generation of structured outputs following a JSON schema, a regex, etc. "},{"location":"sections/how_to_guides/advanced/structured_generation/#outlines","title":"Outlines","text":"Distilabel integrates outlines within some LLM subclasses. At the moment, the following LLMs integrated with outlines are supported in distilabel : TransformersLLM , vLLM or LlamaCppLLM , so that anyone can generate structured outputs in the form of JSON or a parseable regex. The LLM has an argument named structured_output 1 that determines how we can generate structured outputs with it, let's see an example using LlamaCppLLM . Note For outlines integration to work you may need to install the corresponding dependencies: pip install distilabel[outlines]\n "},{"location":"sections/how_to_guides/advanced/structured_generation/#json","title":"JSON","text":"We will start with a JSON example, where we initially define a pydantic.BaseModel schema to guide the generation of the structured output. 
Note Take a look at StructuredOutputType to see the expected format of the structured_output dict variable. from pydantic import BaseModel\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n And then we provide that schema to the structured_output argument of the LLM. from distilabel.models import LlamaCppLLM\n\nllm = LlamaCppLLM(\n model_path=\"./openhermes-2.5-mistral-7b.Q4_K_M.gguf\", # (1)\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"json\", \"schema\": User},\n)\nllm.load()\n - We have previously downloaded a GGUF model i.e.
llama.cpp compatible, from the Hugging Face Hub using curl2, but any model can be used as a replacement, as long as the model_path argument is updated. And we are ready to pass our instruction as usual: import json\n\nresult = llm.generate(\n [[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]],\n max_new_tokens=50\n)\n\ndata = json.loads(result[0][0])\ndata\n# {'name': 'Kathy', 'last_name': 'Smith', 'id': 4539210}\nUser(**data)\n# User(name='Kathy', last_name='Smith', id=4539210)\n We get back a Python dictionary (formatted as a string) that we can parse using json.loads , or validate it directly using the User , which is a pydantic.BaseModel instance. "},{"location":"sections/how_to_guides/advanced/structured_generation/#regex","title":"Regex","text":"The following shows an example of text generation whose output adheres to a regular expression: pattern = r\"<name>(.*?)</name>.*?<grade>(.*?)</grade>\" #\u00a0the same pattern for re.compile\n\nllm=LlamaCppLLM(\n model_path=model_path,\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"regex\", \"schema\": pattern},\n)\nllm.load()\n\nresult = llm.generate(\n [\n [\n {\"role\": \"system\", \"content\": \"You are Simpsons' fans who loves assigning grades from A to E, where A is the best and E is the worst.\"},\n {\"role\": \"user\", \"content\": \"What's up with Homer Simpson?\"}\n ]\n ],\n max_new_tokens=200\n)\n We can check the output by parsing the content using the same pattern we required from the LLM. import re\nmatch = re.search(pattern, result[0][0])\n\nif match:\n name = match.group(1)\n grade = match.group(2)\n print(f\"Name: {name}, Grade: {grade}\")\n# Name: Homer Simpson, Grade: C+\n These were some simple examples, but one can see the options this opens. Tip A full pipeline example can be seen in the following script: examples/structured_generation_with_outlines.py "},{"location":"sections/how_to_guides/advanced/structured_generation/#instructor","title":"Instructor","text":"For other LLM providers behind APIs, there's no direct way of accessing the internal logit processor like outlines does, but thanks to instructor we can generate structured output from LLM providers based on pydantic.BaseModel objects. We have integrated instructor to deal with the AsyncLLM . Note For instructor integration to work you may need to install the corresponding dependencies: pip install distilabel[instructor]\n Note Take a look at InstructorStructuredOutputType to see the expected format of the structured_output dict variable. The following is the same example you can see in the outlines JSON section above, for comparison purposes. from pydantic import BaseModel\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n And then we provide that schema to the structured_output argument of the LLM: Note In this example we are using Meta Llama 3.1 8B Instruct; keep in mind that not all models support structured outputs. 
from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n structured_output={\"schema\": User}\n)\nllm.load()\n And we are ready to pass our instructions as usual: import json\n\nresult = llm.generate(\n [[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]],\n max_new_tokens=256\n)\n\ndata = json.loads(result[0][0])\ndata\n# {'name': 'John', 'last_name': 'Doe', 'id': 12345}\nUser(**data)\n# User(name='John', last_name='Doe', id=12345)\n We get back a Python dictionary (formatted as a string) that we can parse using json.loads , or validate it directly using the User , which is a pydantic.BaseModel instance. Tip A full pipeline example can be seen in the following script: examples/structured_generation_with_instructor.py "},{"location":"sections/how_to_guides/advanced/structured_generation/#openai-json","title":"OpenAI JSON","text":"OpenAI offers a JSON Mode to deal with structured output via their API; let's see how to make use of it. The JSON mode instructs the model to always return a JSON object following the provided instructions. Warning Bear in mind, for this to work, you must instruct the model in some way to generate JSON, either in the system message or in the instruction, as can be seen in the API reference. Contrary to what we have via outlines , JSON mode will not guarantee the output matches any specific schema, only that it is valid and parses without errors. More information can be found in the OpenAI documentation. Other than the reference to generating JSON, to ensure the model generates parseable JSON we can pass the argument response_format=\"json\" 3: from distilabel.models import OpenAILLM\nllm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\nllm.generate(..., response_format=\"json\")\n -
You can check the variable type by importing it from: from distilabel.steps.tasks.structured_outputs.outlines import StructuredOutputType\n \u21a9 -
Download the model with curl: curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n \u21a9 -
Keep in mind that to interact with this response_format argument in a pipeline, you will have to pass it via the generation_kwargs : # Assuming a pipeline is already defined, and we have a task using OpenAILLM called `task_with_openai`:\npipeline.run(\n parameters={\n \"task_with_openai\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"response_format\": \"json\"\n }\n }\n }\n }\n)\n \u21a9 "},{"location":"sections/how_to_guides/advanced/cli/","title":"Command Line Interface (CLI)","text":"Distilabel offers a CLI to explore and re-run existing Pipeline dumps, meaning that an existing dump can be explored to see the steps, how those are connected, the runtime parameters used, and also re-run it with the same or different runtime parameters, respectively. "},{"location":"sections/how_to_guides/advanced/cli/#available-commands","title":"Available commands","text":"The only available command as of the current version of distilabel is distilabel pipeline . $ distilabel pipeline --help\n\n Usage: distilabel pipeline [OPTIONS] COMMAND [ARGS]...\n\n Commands to run and inspect Distilabel pipelines.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 --help Show this message and exit. \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n\u256d\u2500 Commands \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 info Get information about a Distilabel pipeline. \u2502\n\u2502 run Run a Distilabel pipeline. \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n So on, distilabel pipeline has two subcommands: info and run , as described below. Note that for testing purposes we will be using the following dataset. 
"},{"location":"sections/how_to_guides/advanced/cli/#distilabel-pipeline-info","title":"distilabel pipeline info ","text":"$ distilabel pipeline info --help\n\n Usage: distilabel pipeline info [OPTIONS]\n\n Get information about a Distilabel pipeline.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 * --config TEXT Path or URL to the Distilabel pipeline configuration file. \u2502\n\u2502 [default: None] \u2502\n\u2502 [required] \u2502\n\u2502 --help Show this message and exit. \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n As we can see from the help message, we need to pass either a Path or a URL . This second option comes handy for datasets stored in Hugging Face Hub, for example: distilabel pipeline info --config \"https://huggingface.co/datasets/distilabel-internal-testing/instruction-dataset-mini-with-generations/raw/main/pipeline.yaml\"\n If we take a look: The pipeline information includes the steps used in the Pipeline along with the Runtime Parameter that was used, as well as a description of each of them, and also the connections between these steps. These can be helpful to explore the Pipeline locally. "},{"location":"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run","title":"distilabel pipeline run ","text":"We can also run a Pipeline from the CLI just pointing to the same pipeline.yaml file or an URL pointing to it and calling distilabel pipeline run . Alternatively, an URL pointing to a Python script containing a distilabel pipeline can be used: $ distilabel pipeline run --help\n\n Usage: distilabel pipeline run [OPTIONS]\n\n Run a Distilabel pipeline.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 --param PARSE_RUNTIME_PARAM [default: (dynamic)] \u2502\n\u2502 --config TEXT Path or URL to the Distilabel pipeline configuration file. 
\u2502\n\u2502 [default: None] \u2502\n\u2502 --script TEXT URL pointing to a python script containing a distilabel \u2502\n\u2502 pipeline. \u2502\n\u2502 [default: None] \u2502\n\u2502 --pipeline-variable-name TEXT Name of the pipeline in a script. I.e. the 'pipeline' \u2502\n\u2502 variable in `with Pipeline(...) as pipeline:...`. \u2502\n\u2502 [default: pipeline] \u2502\n\u2502 --ignore-cache --no-ignore-cache Whether to ignore the cache and re-run the pipeline from \u2502\n\u2502 scratch. \u2502\n\u2502 [default: no-ignore-cache] \u2502\n\u2502 --repo-id TEXT The Hugging Face Hub repository ID to push the resulting \u2502\n\u2502 dataset to. \u2502\n\u2502 [default: None] \u2502\n\u2502 --commit-message TEXT The commit message to use when pushing the dataset. \u2502\n\u2502 [default: None] \u2502\n\u2502 --private --no-private Whether to make the resulting dataset private on the Hub. \u2502\n\u2502 [default: no-private] \u2502\n\u2502 --token TEXT The Hugging Face Hub API token to use when pushing the \u2502\n\u2502 dataset. \u2502\n\u2502 [default: None] \u2502\n\u2502 --help Show this message and exit. \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n Using --config option, we must pass a path with a pipeline.yaml file. To specify the runtime parameters of the steps we will need to use the --param option and the value of the parameter in the following format: distilabel pipeline run --config \"https://huggingface.co/datasets/distilabel-internal-testing/instruction-dataset-mini-with-generations/raw/main/pipeline.yaml\" \\\n --param load_dataset.repo_id=distilabel-internal-testing/instruction-dataset-mini \\\n --param load_dataset.split=test \\\n --param generate_with_gpt35.llm.generation_kwargs.max_new_tokens=512 \\\n --param generate_with_gpt35.llm.generation_kwargs.temperature=0.7 \\\n --param to_argilla.dataset_name=text_generation_with_gpt35 \\\n --param to_argilla.dataset_workspace=admin\n Or using --script we can pass directly a remote python script (keep in mind --config and --script are exclusive): distilabel pipeline run --script \"https://huggingface.co/datasets/distilabel-internal-testing/pipe_nothing_test/raw/main/pipe_nothing.py\"\n You can also pass runtime parameters to the python script as we saw with --config option. Again, this helps with the reproducibility of the results, and simplifies sharing not only the final dataset but also the process to generate it. "},{"location":"sections/how_to_guides/basic/llm/","title":"Executing Tasks with LLMs","text":""},{"location":"sections/how_to_guides/basic/llm/#working-with-llms","title":"Working with LLMs","text":"LLM subclasses are designed to be used within a Task, but they can also be used standalone. 
from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n)\nllm.load()\n\nllm.generate_outputs(\n inputs=[\n [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n ],\n)\n# [\n# {\n# \"generations\": [\n# \"The capital of Spain is Madrid.\"\n# ],\n# \"statistics\": {\n# \"input_tokens\": [\n# 43\n# ],\n# \"output_tokens\": [\n# 8\n# ]\n# }\n# }\n# ]\n Note Always call the LLM.load or Task.load method when using LLMs standalone or as part of a Task . If using a Pipeline , this is done automatically in Pipeline.run() . New in version 1.5.0 Since version 1.5.0 the LLM output is a list of dictionaries (one per item in the inputs ), each containing generations , that reports the text returned by the LLM , and a statistics field that will store statistics related to the LLM generation. Initially, this will include input_tokens and output_tokens when available, which will be obtained via the API when available, or if a tokenizer is available for the model used, using the tokenizer for the model. This data will be moved by the corresponding Task during the pipeline processing and moved to distilabel_metadata so we can operate on this data if we want, like for example computing the number of tokens per dataset. To access to the previous result one just has to access to the generations in the resulting dictionary: result[0][\"generations\"] . "},{"location":"sections/how_to_guides/basic/llm/#offline-batch-generation","title":"Offline Batch Generation","text":"By default, all LLM s will generate text in a synchronous manner i.e. send inputs using generate_outputs method that will get blocked until outputs are generated. There are some LLM s (such as OpenAILLM) that implements what we denote as offline batch generation, which allows to send the inputs to the LLM-as-a-service which will generate the outputs asynchronously and give us a job id that we can use later to check the status and retrieve the generated outputs when they are ready. LLM-as-a-service platforms offers this feature as a way to save costs in exchange of waiting for the outputs to be generated. To use this feature in distilabel the only thing we need to do is to set the use_offline_batch_generation attribute to True when creating the LLM instance: from distilabel.models import OpenAILLM\n\nllm = OpenAILLM(\n model=\"gpt-4o\",\n use_offline_batch_generation=True,\n)\n\nllm.load()\n\nllm.jobs_ids # (1)\n# None\n\nllm.generate_outputs( # (2)\n inputs=[\n [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n ],\n)\n# DistilabelOfflineBatchGenerationNotFinishedException: Batch generation with jobs_ids=('batch_OGB4VjKpu2ay9nz3iiFJxt5H',) is not finished\n\nllm.jobs_ids # (3)\n# ('batch_OGB4VjKpu2ay9nz3iiFJxt5H',)\n\n\nllm.generate_outputs( # (4)\n inputs=[\n [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n ],\n)\n# [{'generations': ['The capital of Spain is Madrid.'],\n# 'statistics': {'input_tokens': [13], 'output_tokens': [7]}}]\n - At first the
jobs_ids attribute is None . - The first call to
generate_outputs will send the inputs to the LLM-as-a-service and raise a DistilabelOfflineBatchGenerationNotFinishedException since the outputs are not ready yet. - After the first call to
generate_outputs the jobs_ids attribute will contain the job ids created for generating the outputs. - The second call or subsequent calls to
generate_outputs will return the outputs if they are ready or raise a DistilabelOfflineBatchGenerationNotFinishedException if they are not ready yet. The offline_batch_generation_block_until_done attribute can be used to block the generate_outputs method until the outputs are ready polling the platform the specified amount of seconds. from distilabel.models import OpenAILLM\n\nllm = OpenAILLM(\n model=\"gpt-4o\",\n use_offline_batch_generation=True,\n offline_batch_generation_block_until_done=5, # poll for results every 5 seconds\n)\nllm.load()\n\nllm.generate_outputs(\n inputs=[\n [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n ],\n)\n# [{'generations': ['The capital of Spain is Madrid.'],\n# 'statistics': {'input_tokens': [13], 'output_tokens': [7]}}]\n "},{"location":"sections/how_to_guides/basic/llm/#within-a-task","title":"Within a Task","text":"Pass the LLM as an argument to the Task , and the task will handle the rest. from distilabel.models import OpenAILLM\nfrom distilabel.steps.tasks import TextGeneration\n\nllm = OpenAILLM(model=\"gpt-4o-mini\")\ntask = TextGeneration(name=\"text_generation\", llm=llm)\n\ntask.load()\n\nnext(task.process(inputs=[{\"instruction\": \"What's the capital of Spain?\"}]))\n# [{'instruction': \"What's the capital of Spain?\",\n# 'generation': 'The capital of Spain is Madrid.',\n# 'distilabel_metadata': {'raw_output_text_generation': 'The capital of Spain is Madrid.',\n# 'raw_input_text_generation': [{'role': 'user',\n# 'content': \"What's the capital of Spain?\"}],\n# 'statistics_text_generation': {'input_tokens': 13, 'output_tokens': 7}},\n# 'model_name': 'gpt-4o-mini'}]\n Note As mentioned in Working with LLMs section, the generation of an LLM is automatically moved to distilabel_metadata to avoid interference with the common workflow, so the addition of the statistics it's an extra component available for the user, but nothing has to be changed in the defined pipelines. "},{"location":"sections/how_to_guides/basic/llm/#runtime-parameters","title":"Runtime Parameters","text":"LLMs can have runtime parameters, such as generation_kwargs , provided via the Pipeline.run() method using the params argument. Note Runtime parameters can differ between LLM subclasses, caused by the different functionalities offered by the LLM providers. from distilabel.pipeline import Pipeline\nfrom distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(name=\"text-generation-pipeline\") as pipeline:\n load_dataset = LoadDataFromDicts(\n name=\"load_dataset\",\n data=[{\"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\"}],\n )\n\n text_generation = TextGeneration(\n name=\"text_generation\",\n llm=OpenAILLM(model=\"gpt-4o-mini\"),\n )\n\n load_dataset >> text_generation\n\nif __name__ == \"__main__\":\n pipeline.run(\n parameters={\n text_generation.name: {\"llm\": {\"generation_kwargs\": {\"temperature\": 0.3}}},\n },\n )\n "},{"location":"sections/how_to_guides/basic/llm/#creating-custom-llms","title":"Creating custom LLMs","text":"To create custom LLMs, subclass either LLM for synchronous or AsyncLLM for asynchronous LLMs. Implement the following methods: -
model_name : A property containing the model's name. -
generate : A method that takes a list of prompts and returns generated texts. -
agenerate : A method that takes a single prompt and returns generated texts. This method is used within the generate method of the AsyncLLM class. -
(optional) get_last_hidden_state : is a method that will take a list of prompts and return a list of hidden states. This method is optional and will be used by some tasks such as the GenerateEmbeddings task. Custom LLMCustom AsyncLLM from typing import Any\n\nfrom pydantic import validate_call\n\nfrom distilabel.models import LLM\nfrom distilabel.typing import GenerateOutput, HiddenState\nfrom distilabel.typing import ChatType\n\nclass CustomLLM(LLM):\n @property\n def model_name(self) -> str:\n return \"my-model\"\n\n @validate_call\n def generate(self, inputs: List[ChatType], num_generations: int = 1, **kwargs: Any) -> List[GenerateOutput]:\n for _ in range(num_generations):\n ...\n\n def get_last_hidden_state(self, inputs: List[ChatType]) -> List[HiddenState]:\n ...\n from typing import Any\n\nfrom pydantic import validate_call\n\nfrom distilabel.models import AsyncLLM\nfrom distilabel.typing import GenerateOutput, HiddenState\nfrom distilabel.typing import ChatType\n\nclass CustomAsyncLLM(AsyncLLM):\n @property\n def model_name(self) -> str:\n return \"my-model\"\n\n @validate_call\n async def agenerate(self, input: ChatType, num_generations: int = 1, **kwargs: Any) -> GenerateOutput:\n for _ in range(num_generations):\n ...\n\n def get_last_hidden_state(self, inputs: List[ChatType]) -> List[HiddenState]:\n ...\n generate and agenerate keyword arguments (but input and num_generations ) are considered as RuntimeParameter s, so a value can be passed to them via the parameters argument of the Pipeline.run method. Note To have the arguments of the generate and agenerate coerced to the expected types, the validate_call decorator is used, which will automatically coerce the arguments to the expected types, and raise an error if the types are not correct. This is specially useful when providing a value for an argument of generate or agenerate from the CLI, since the CLI will always provide the arguments as strings. Warning Additional LLMs created in distilabel will have to take into account how the statistics are generated to properly include them in the LLM output. "},{"location":"sections/how_to_guides/basic/llm/#available-llms","title":"Available LLMs","text":"Our LLM gallery shows a list of the available LLMs that can be used within the distilabel library. "},{"location":"sections/how_to_guides/basic/pipeline/","title":"Execute Steps and Tasks in a Pipeline","text":""},{"location":"sections/how_to_guides/basic/pipeline/#how-to-create-a-pipeline","title":"How to create a pipeline","text":"Pipeline organise the Steps and Tasks in a sequence, where the output of one step is the input of the next one. A Pipeline should be created by making use of the context manager along with passing a name, and optionally a description. from distilabel.pipeline import Pipeline\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n ...\n "},{"location":"sections/how_to_guides/basic/pipeline/#connecting-steps-with-the-stepconnect-method","title":"Connecting steps with the Step.connect method","text":"Now, we can define the steps of our Pipeline . Note Steps without predecessors (i.e. root steps), need to be GeneratorStep s such as LoadDataFromDicts or LoadDataFromHub . After this, other steps can be defined. 
from distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(name=\"load_dataset\")\n ...\n Easily load your datasets If you are already used to work with Hugging Face's Dataset via load_dataset or pd.DataFrame , you can create the GeneratorStep directly from the dataset (or dataframe), and create the step with the help of make_generator_step : From a list of dictsFrom datasets.Dataset From pd.DataFrame from distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = [{\"instruction\": \"Tell me a joke.\"}]\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n ...\n from datasets import load_dataset\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = load_dataset(\n \"DIBT/10k_prompts_ranked\",\n split=\"train\"\n).filter(\n lambda r: r[\"avg_rating\"]>=4 and r[\"num_responses\"]>=2\n).select(range(500))\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n ...\n import pandas as pd\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = pd.read_csv(\"path/to/dataset.csv\")\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n ...\n Next, we will use prompt column from the dataset obtained through LoadDataFromHub and use several LLM s to execute a TextGeneration task. We will also use the Task.connect() method to connect the steps, so the output of one step is the input of the next one. Note The order of the execution of the steps will be determined by the connections of the steps. In this case, the TextGeneration tasks will be executed after the LoadDataFromHub step. from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n task.connect(load_dataset)\n\n ...\n For each row of the dataset, the TextGeneration task will generate a text based on the instruction column and the LLM model, and store the result (a single string) in a new column called generation . Because we need to have the response s in the same column, we will add GroupColumns to combine them all in the same column as a list of strings. Note In this case, the GroupColumns tasks will be executed after all TextGeneration steps. 
from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n load_dataset.connect(task)\n task.connect(combine_generations)\n "},{"location":"sections/how_to_guides/basic/pipeline/#connecting-steps-with-the-operator","title":"Connecting steps with the >> operator","text":"Besides the Step.connect method: step1.connect(step2) , there's an alternative way by making use of the >> operator. We can connect steps in a more readable way, and it's also possible to connect multiple steps at once. Step per stepMultiple steps at once Each call to step1.connect(step2) has been exchanged by step1 >> step2 within the loop. from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n load_dataset >> task >> combine_generations\n Each task is first appended to a list, and then all the calls to connections are done in a single call. from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n tasks = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n tasks.append(\n TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n )\n\n load_dataset >> tasks >> combine_generations\n "},{"location":"sections/how_to_guides/basic/pipeline/#routing-batches-to-specific-downstream-steps","title":"Routing batches to specific downstream steps","text":"In some pipelines, you may want to send batches from a single upstream step to specific downstream steps based on certain conditions. To achieve this, you can use a routing_batch_function . This function takes a list of downstream steps and returns a list of step names to which each batch should be routed. 
Let's update the example above to route the batches loaded by the LoadDataFromHub step to just 2 of the TextGeneration tasks. First, we will create our custom routing_batch_function , and then we will update the pipeline to use it: import random\nfrom distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\n@routing_batch_function\ndef sample_two_steps(steps: list[str]) -> list[str]:\n return random.sample(steps, 2)\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"},\n )\n\n tasks = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.0-pro\"),\n ):\n tasks.append(\n TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n )\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n load_dataset >> sample_two_steps >> tasks >> combine_generations\n The routing_batch_function that we just built is a common one, so distilabel comes with a built-in function that can be used to achieve the same behavior: from distilabel.pipeline import sample_n_steps\n\nsample_two_steps = sample_n_steps(2)\n "},{"location":"sections/how_to_guides/basic/pipeline/#running-the-pipeline","title":"Running the pipeline","text":""},{"location":"sections/how_to_guides/basic/pipeline/#pipelinedry_run","title":"Pipeline.dry_run","text":"Before running the Pipeline we can check if the pipeline is valid using the Pipeline.dry_run() method. It takes the same parameters as the run method, which we will discuss in the following section, plus the batch_size we want the dry run to use (by default set to 1). with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n ...\n\nif __name__ == \"__main__\":\n distiset = pipeline.dry_run(parameters=..., batch_size=1)\n "},{"location":"sections/how_to_guides/basic/pipeline/#pipelinerun","title":"Pipeline.run","text":"After testing, we can now execute the full Pipeline using the Pipeline.run() method. with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n ...\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n \"load_dataset\": {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n \"text_generation_with_gpt-4-0125-preview\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n \"text_generation_with_mistral-large-2402\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n \"text_generation_with_gemini-1.0-pro\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n },\n )\n But if we run the pipeline above, we will see that the run method will fail: ValueError: Step 'text_generation_with_gpt-4-0125-preview' requires inputs ['instruction'], but only the inputs=['prompt', 'completion', 'meta'] are available, which means that the inputs=['instruction'] are missing or not available\nwhen the step gets to be executed in the pipeline. 
Please make sure previous steps to 'text_generation_with_gpt-4-0125-preview' are generating the required inputs.\n This is because, before actually running the pipeline, we must ensure each step has the necessary input columns to be executed. In this case, the TextGeneration task requires the instruction column, but the LoadDataFromHub step generates the prompt column. To solve this, we can use the output_mappings or input_mappings arguments of individual Step s, to map columns from one step to another. with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"}\n )\n\n ...\n If we execute the pipeline again, it will run successfully and we will have a Distiset with the outputs of all the leaf steps of the pipeline which we can push to the Hugging Face Hub. if __name__ == \"__main__\":\n distiset = pipeline.run(...)\n distiset.push_to_hub(\"distilabel-internal-testing/instruction-dataset-mini-with-generations\")\n "},{"location":"sections/how_to_guides/basic/pipeline/#pipelinerun-with-a-dataset","title":"Pipeline.run with a dataset","text":"Note that in most cases, if you don't need the extra flexibility the GeneratorSteps bring you, you can create a dataset as you would normally do and pass it to the Pipeline.run method directly. Look at the highlighted lines to see the updated lines: import random\nfrom datasets import load_dataset\nfrom distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import GroupColumns\nfrom distilabel.steps.tasks import TextGeneration\n\n@routing_batch_function\ndef sample_two_steps(steps: list[str]) -> list[str]:\n return random.sample(steps, 2)\n\ndataset = load_dataset(\n \"distilabel-internal-testing/instruction-dataset-mini\",\n split=\"test\"\n)\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n tasks = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.0-pro\"),\n ):\n tasks.append(\n TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n )\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n sample_two_steps >> tasks >> combine_generations\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n dataset=dataset,\n parameters=...\n )\n "},{"location":"sections/how_to_guides/basic/pipeline/#stopping-the-pipeline","title":"Stopping the pipeline","text":"In case you want to stop the pipeline while it's running, you can press Ctrl+C or Cmd+C depending on your OS (or send a SIGINT to the main process), and the outputs will be stored in the cache. Pressing it an additional time will force the pipeline to stop its execution, but this can lead to losing the generated outputs for certain batches. "},{"location":"sections/how_to_guides/basic/pipeline/#cache","title":"Cache","text":"If for some reason, the pipeline execution stops (for example by pressing Ctrl+C ), the state of the pipeline and the outputs will be stored in the cache, so we can resume the pipeline execution from the point where it was stopped. 
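Resuming doesn't require anything special: as a minimal sketch (assuming the script below is the same one that was previously interrupted), simply running the pipeline again with the same parameters will reuse the batches already stored in the cache instead of regenerating them: with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n ...\n\nif __name__ == \"__main__\":\n # Re-running the exact same pipeline with the same parameters resumes from\n # the cached state, since use_cache defaults to True.\n distiset = pipeline.run(parameters={...})\n 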
If we want to force the pipeline to run again without using the cache, then we can use the use_cache argument of the Pipeline.run() method: if __name__ == \"__main__\":\n distiset = pipeline.run(parameters={...}, use_cache=False)\n Note For more information on caching, we refer the reader to the caching section. "},{"location":"sections/how_to_guides/basic/pipeline/#adjusting-the-batch-size-for-each-step","title":"Adjusting the batch size for each step","text":"Memory issues can arise when processing large datasets or when using large models. To avoid this, we can use the input_batch_size argument of individual tasks. In the following example, each TextGeneration task will receive 5 dictionaries, while the LoadDataFromHub step will send 10 dictionaries per batch: from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"},\n batch_size=10\n )\n\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(\n name=f\"text_generation_with_{llm.model_name.replace('.', '-')}\",\n llm=llm,\n input_batch_size=5,\n )\n\n ...\n "},{"location":"sections/how_to_guides/basic/pipeline/#serializing-the-pipeline","title":"Serializing the pipeline","text":"Sharing a pipeline with others is very easy, as we can serialize the pipeline object using the save method. We can save the pipeline in different formats, such as yaml or json : yamljson if __name__ == \"__main__\":\n pipeline.save(\"pipeline.yaml\", format=\"yaml\")\n if __name__ == \"__main__\":\n pipeline.save(\"pipeline.json\", format=\"json\")\n To load the pipeline, we can use the from_yaml or from_json methods: yamljson pipeline = Pipeline.from_yaml(\"pipeline.yaml\")\n pipeline = Pipeline.from_json(\"pipeline.json\")\n Serializing the pipeline is very useful when we want to share the pipeline with others, or when we want to store the pipeline for future use. It can even be hosted online, so the pipeline can be executed directly using the CLI. "},{"location":"sections/how_to_guides/basic/pipeline/#visualizing-the-pipeline","title":"Visualizing the pipeline","text":"We can visualize the pipeline using the Pipeline.draw() method. This will create a mermaid graph, and return the path to the image. path_to_image = pipeline.draw(\n top_to_bottom=True,\n show_edge_labels=True,\n)\n Within notebooks, we can simply call pipeline and the graph will be displayed. Alternatively, we can use the Pipeline.draw() method to have more control over the graph visualization and use IPython to display it. from IPython.display import Image, display\n\ndisplay(Image(path_to_image))\n Let's now see what the pipeline of the fully working example looks like. "},{"location":"sections/how_to_guides/basic/pipeline/#fully-working-example","title":"Fully working example","text":"To sum up, here is the full code of the pipeline we have created in this section. 
Note that you will need to change the name of the Hugging Face repository where the resulting will be pushed, set OPENAI_API_KEY environment variable, set MISTRAL_API_KEY and have gcloud installed and configured: Code from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"},\n )\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.0-pro\"),\n ):\n task = TextGeneration(\n name=f\"text_generation_with_{llm.model_name.replace('.', '-')}\", llm=llm\n )\n load_dataset.connect(task)\n task.connect(combine_generations)\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n \"load_dataset\": {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n \"text_generation_with_gpt-4-0125-preview\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n \"text_generation_with_mistral-large-2402\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n \"text_generation_with_gemini-1.0-pro\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n },\n )\n distiset.push_to_hub(\n \"distilabel-internal-testing/instruction-dataset-mini-with-generations\"\n )\n "},{"location":"sections/how_to_guides/basic/step/","title":"Steps for processing data","text":""},{"location":"sections/how_to_guides/basic/step/#working-with-steps","title":"Working with Steps","text":"The Step is intended to be used within the scope of a Pipeline , which will orchestrate the different steps defined but can also be used standalone. Assuming that we have a Step already defined as it follows: from typing import TYPE_CHECKING\nfrom distilabel.steps import Step, StepInput\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns, StepOutput\n\nclass MyStep(Step):\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"input_field\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"output_field\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n input[\"output_field\"] = input[\"input_field\"]\n yield inputs\n Then we can use it as follows: step = MyStep(name=\"my-step\")\nstep.load()\n\nnext(step.process([{\"input_field\": \"value\"}]))\n# [{'input_field': 'value', 'output_field': 'value'}]\n Note The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution. "},{"location":"sections/how_to_guides/basic/step/#arguments","title":"Arguments","text":" -
input_mappings : is a dictionary that maps keys from the input dictionaries to the keys expected by the step. For example, if input_mappings={\"instruction\": \"prompt\"} , it means that the input key prompt will be used as the key instruction for the current step. -
output_mappings : is a dictionary that can be used to map the outputs of the step to other names. For example, if output_mappings={\"conversation\": \"prompt\"} , it means that the output key conversation will be renamed to prompt for the next step. -
input_batch_size (by default set to 50): is independent for every step and determines how many input dictionaries will be processed at once. "},{"location":"sections/how_to_guides/basic/step/#runtime-parameters","title":"Runtime parameters","text":"Step s can also have RuntimeParameter , which are parameters that can only be set after the pipeline initialisation, when calling Pipeline.run . from distilabel.mixins.runtime_parameters import RuntimeParameter\n\nclass Step(...):\n input_batch_size: RuntimeParameter[PositiveInt] = Field(\n default=DEFAULT_INPUT_BATCH_SIZE,\n description=\"The number of rows that will contain the batches processed by the\"\n \" step.\",\n )\n "},{"location":"sections/how_to_guides/basic/step/#types-of-steps","title":"Types of Steps","text":"There are a few special types of Step in distilabel : -
GeneratorStep : is a step that only generates data; it doesn't need any input data from previous steps and is normally the first node in a Pipeline . More information: Components -> Step - GeneratorStep. -
GlobalStep : is a step with the standard interface i.e. receives inputs and generates outputs, but it processes all the data at once, and is often the final step in the Pipeline . Note that a GlobalStep requires all the previous steps to finish before it can start. More information: Components - Step - GlobalStep. -
Task : is essentially the same as a default Step , but it relies on an LLM as an attribute, and its process method will be in charge of calling that LLM. More information: Components - Task. "},{"location":"sections/how_to_guides/basic/step/#defining-custom-steps","title":"Defining custom Steps","text":"We can define a custom step by creating a new subclass of the Step and defining the following: -
inputs : is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are booleans indicating whether the column is required or not. -
outputs : is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are booleans indicating whether the column is required or not. -
process : is a method that receives the input data and returns the output data, and it should be a generator, meaning that it should yield the output data. Note The default signature for the process method is process(self, *inputs: StepInput) -> StepOutput . The argument inputs should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one. Warning For the custom Step subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline , the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING , otherwise, the validation and/or serialization will fail. Inherit from Step Using the @step decorator We can inherit from the Step class and define the inputs , outputs , and process methods as follows: from typing import TYPE_CHECKING\nfrom distilabel.steps import Step, StepInput\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns, StepOutput\n\nclass CustomStep(Step):\n @property\n def inputs(self) -> \"StepColumns\":\n ...\n\n @property\n def outputs(self) -> \"StepColumns\":\n ...\n\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n for upstream_step_inputs in inputs:\n ...\n yield item\n\n # When overridden (ideally under the `typing_extensions.override` decorator)\n # @typing_extensions.override\n # def process(self, inputs: StepInput) -> StepOutput:\n # for input in inputs:\n # ...\n # yield inputs\n The @step decorator will take care of the boilerplate code, and will allow to define the inputs , outputs , and process methods in a more straightforward way. One downside is that it won't let you access the self attributes if any, neither set those, so if you need to access or set any attribute, you should go with the first approach of defining the custom Step subclass. from typing import TYPE_CHECKING\nfrom distilabel.steps import StepInput, step\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepOutput\n\n@step(inputs=[...], outputs=[...])\ndef CustomStep(inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n ...\n yield inputs\n\nstep = CustomStep(name=\"my-step\")\n "},{"location":"sections/how_to_guides/basic/step/generator_step/","title":"GeneratorStep","text":"The GeneratorStep is a subclass of Step that is intended to be used as the first step within a Pipeline , because it doesn't require input and generates data that can be used by other steps. Alternatively, it can also be used as a standalone. 
from typing import List, TYPE_CHECKING\nfrom typing_extensions import override\n\nfrom distilabel.steps import GeneratorStep\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns, GeneratorStepOutput\n\nclass MyGeneratorStep(GeneratorStep):\n instructions: List[str]\n\n @override\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n if offset:\n self.instructions = self.instructions[offset:]\n\n while self.instructions:\n batch = [\n {\n \"instruction\": instruction\n } for instruction in self.instructions[: self.batch_size]\n ]\n self.instructions = self.instructions[self.batch_size :]\n yield (\n batch,\n True if len(self.instructions) == 0 else False,\n )\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"instruction\"]\n Then we can use it as follows: step = MyGeneratorStep(\n name=\"my-generator-step\",\n instructions=[\"Tell me a joke.\", \"Tell me a story.\"],\n batch_size=1,\n)\nstep.load()\n\nnext(step.process(offset=0))\n# ([{'instruction': 'Tell me a joke.'}], False)\nnext(step.process(offset=1))\n# ([{'instruction': 'Tell me a story.'}], True)\n Note The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution. "},{"location":"sections/how_to_guides/basic/step/generator_step/#defining-custom-generatorsteps","title":"Defining custom GeneratorSteps","text":"We can define a custom generator step by creating a new subclass of the GeneratorStep and defining the following: -
outputs : is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are booleans indicating whether the column is required or not. -
process : is a method that yields output data and a boolean flag indicating whether that's the last batch to be generated. Note The default signature for the process method is process(self, offset: int = 0) -> GeneratorStepOutput . The argument offset should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one. Warning For the custom Step subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline , the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING , otherwise, the validation and/or serialization will fail. Inherit from GeneratorStep Using the @step decorator We can inherit from the GeneratorStep class and define the outputs , and process methods as follows: from typing import List, TYPE_CHECKING\nfrom typing_extensions import override\n\nfrom distilabel.steps import GeneratorStep\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns, GeneratorStepOutput\n\nclass MyGeneratorStep(GeneratorStep):\n instructions: List[str]\n\n @override\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n ...\n\n @property\n def outputs(self) -> \"StepColumns\":\n ...\n The @step decorator will take care of the boilerplate code, and will allow to define the outputs , and process methods in a more straightforward way. One downside is that it won't let you access the self attributes if any, neither set those, so if you need to access or set any attribute, you should go with the first approach of defining the custom GeneratorStep subclass. from typing import TYPE_CHECKING\nfrom distilabel.steps import step\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import GeneratorStepOutput\n\n@step(outputs=[...], step_type=\"generator\")\ndef CustomGeneratorStep(offset: int = 0) -> \"GeneratorStepOutput\":\n yield (\n ...,\n True if offset == 10 else False,\n )\n\nstep = CustomGeneratorStep(name=\"my-step\")\n "},{"location":"sections/how_to_guides/basic/step/global_step/","title":"GlobalStep","text":"The GlobalStep is a subclass of Step that is used to define a step that requires the previous steps to be completed to run, since it will wait until all the input batches are received before running. This step is useful when you need to run a step that requires all the input data to be processed before running. Alternatively, it can also be used as a standalone. "},{"location":"sections/how_to_guides/basic/step/global_step/#defining-custom-globalsteps","title":"Defining custom GlobalSteps","text":"We can define a custom step by creating a new subclass of the GlobalStep and defining the following: -
inputs : is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are booleans indicating whether the column is required or not. -
outputs : is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are booleans indicating whether the column is required or not. -
process : is a method that receives the input data and returns the output data, and it should be a generator, meaning that it should yield the output data. Note The default signature for the process method is process(self, *inputs: StepInput) -> StepOutput . The argument inputs should be respected and no extra arguments can be provided; the type-hints and return type-hints should be respected too, because the step should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one. Warning For the custom GlobalStep subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline , the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING , otherwise, the validation and/or serialization will fail. Inherit from GlobalStep Using the @step decorator We can inherit from the GlobalStep class and define the inputs , outputs , and process methods as follows: from typing import TYPE_CHECKING\nfrom distilabel.steps import GlobalStep, StepInput\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns, StepOutput\n\nclass CustomStep(GlobalStep):\n @property\n def inputs(self) -> \"StepColumns\":\n ...\n\n @property\n def outputs(self) -> \"StepColumns\":\n ...\n\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n for upstream_step_inputs in inputs:\n for item in upstream_step_inputs:\n ...\n yield item\n\n # When overridden (ideally under the `typing_extensions.override` decorator)\n # @typing_extensions.override\n # def process(self, inputs: StepInput) -> StepOutput:\n # for input in inputs:\n # ...\n # yield inputs\n The @step decorator will take care of the boilerplate code, and will allow you to define the inputs , outputs , and process methods in a more straightforward way. One downside is that it won't let you access the self attributes if any, nor set them, so if you need to access or set any attribute, you should go with the first approach of defining the custom GlobalStep subclass. from typing import TYPE_CHECKING\nfrom distilabel.steps import StepInput, step\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepOutput\n\n@step(inputs=[...], outputs=[...], step_type=\"global\")\ndef CustomStep(inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n ...\n yield inputs\n\nstep = CustomStep(name=\"my-step\")\n "},{"location":"sections/how_to_guides/basic/task/","title":"Tasks for generating and judging with LLMs","text":""},{"location":"sections/how_to_guides/basic/task/#working-with-tasks","title":"Working with Tasks","text":"The Task is a special kind of Step that includes the LLM as a mandatory argument. As with a Step , it is normally used within a Pipeline but can also be used standalone. For example, the most basic task is the TextGeneration task, which generates text based on a given instruction. 
from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n name=\"text-generation\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n# {\n# \"instruction\": \"What's the capital of Spain?\",\n# \"generation\": \"The capital of Spain is Madrid.\",\n# \"distilabel_metadata\": {\n# \"raw_output_text-generation\": \"The capital of Spain is Madrid.\",\n# \"raw_input_text-generation\": [\n# {\n# \"role\": \"user\",\n# \"content\": \"What's the capital of Spain?\"\n# }\n# ],\n# \"statistics_text-generation\": { # (1)\n# \"input_tokens\": 18,\n# \"output_tokens\": 8\n# }\n# },\n# \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n# }\n# ]\n - The
LLMs will not only return the text but also a statistics_{STEP_NAME} field that will contain statistics related to the generation. If available, at least the input and output tokens will be returned. Note The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution. As shown above, the TextGeneration task adds a generation based on the instruction . New in version 1.2.0 Since version 1.2.0 , we provide some metadata about the LLM call through distilabel_metadata . This can be disabled by setting the add_raw_output attribute to False when creating the task. Additionally, since version 1.4.0 , the formatted input can also be included, which can be helpful when testing custom templates (testing the pipeline using the dry_run method). disable raw input and output task = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n add_raw_output=False,\n add_raw_input=False\n)\n New in version 1.5.0 Since version 1.5.0 , distilabel_metadata includes a new statistics field out of the box. The generation from the LLM will not only contain the text, but also statistics associated with the text if available, like the input and output tokens. This field will be generated with statistics_{STEP_NAME} to avoid collisions between different steps in the pipeline, similar to how raw_output_{STEP_NAME} works. "},{"location":"sections/how_to_guides/basic/task/#taskprint","title":"Task.print","text":"New in version 1.4.0 The Task.print method is new since version 1.4.0 . The Tasks include a handy method to show what the prompt formatted for an LLM would look like; let's see an example with UltraFeedback , but it applies to any other Task . from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\nuf = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n)\nuf.load()\nuf.print()\n The result will be a rendered prompt, with the System prompt (if contained for the task) and the User prompt, rendered with rich (it will show exactly the same in a Jupyter notebook). In case you want to test with a custom input, you can pass an example to the task's format_input method (or generate it on your own depending on the task), and pass it to the print method so that it shows your example: uf.print(\n uf.format_input({\"instruction\": \"test\", \"generations\": [\"1\", \"2\"]})\n)\n Using a DummyLLM to avoid loading one In case you don't want to load an LLM to render the template, you can create a dummy one like the ones we could use for testing. from typing import Any\n\nfrom distilabel.models import LLM\nfrom distilabel.models.mixins import MagpieChatTemplateMixin\n\nclass DummyLLM(LLM, MagpieChatTemplateMixin):\n structured_output: Any = None\n magpie_pre_query_template: str = \"llama3\"\n\n def load(self) -> None:\n pass\n\n @property\n def model_name(self) -> str:\n return \"test\"\n\n def generate(\n self, input: \"FormattedInput\", num_generations: int = 1\n ) -> \"GenerateOutput\":\n return [\"output\" for _ in range(num_generations)]\n You can use this LLM just as any of the other ones to load your task and call print : uf = UltraFeedback(llm=DummyLLM())\nuf.load()\nuf.print()\n Note When creating a custom task, the print method will be available by default, but it is limited to the most common scenarios for the inputs. 
If you test your new task and find it's not working as expected (for example, if your task contains one input consisting of a list of texts instead of a single one), you should override the _sample_input method. You can inspect the UltraFeedback source code for this. "},{"location":"sections/how_to_guides/basic/task/#specifying-the-number-of-generations-and-grouping-generations","title":"Specifying the number of generations and grouping generations","text":"All the Task s have a num_generations attribute that allows defining the number of generations that we want to have per input. We can update the example above to generate 3 completions per input: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n name=\"text-generation\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n num_generations=3,\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n# {\n# 'instruction': \"What's the capital of Spain?\",\n# 'generation': 'The capital of Spain is Madrid.',\n# 'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n# 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n# },\n# {\n# 'instruction': \"What's the capital of Spain?\",\n# 'generation': 'The capital of Spain is Madrid.',\n# 'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n# 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n# },\n# {\n# 'instruction': \"What's the capital of Spain?\",\n# 'generation': 'The capital of Spain is Madrid.',\n# 'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n# 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n# }\n# ]\n In addition, we might want to group the generations in a single output row as maybe one downstream step expects a single row with multiple generations. We can achieve this by setting the group_generations attribute to True : from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n name=\"text-generation\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n num_generations=3,\n group_generations=True\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n# {\n# 'instruction': \"What's the capital of Spain?\",\n# 'generation': ['The capital of Spain is Madrid.', 'The capital of Spain is Madrid.', 'The capital of Spain is Madrid.'],\n# 'distilabel_metadata': [\n# {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n# {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n# {'raw_output_text-generation': 'The capital of Spain is Madrid.'}\n# ],\n# 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n# }\n# ]\n "},{"location":"sections/how_to_guides/basic/task/#defining-custom-tasks","title":"Defining custom Tasks","text":"We can define a custom step by creating a new subclass of the Task and defining the following: -
inputs : is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are booleans indicating whether the column is required or not. -
format_input : is a method that receives a dictionary with the input data and returns a ChatType following the chat-completion OpenAI message formatting. -
outputs : is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are booleans indicating whether the column is required or not. This property should always include model_name as one of the outputs, since that's automatically injected from the LLM. -
format_output : is a method that receives the output from the LLM and optionally also the input data (which may be useful to build the output in some scenarios), and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs . Note that there's no need to include the model_name in the output. Inherit from Task Using the @task decorator When using the Task class inheritance method for creating a custom task, we can also optionally override the Task.process method to define a more complex processing logic involving an LLM , as the default one just calls the LLM.generate method once previously formatting the input and subsequently formatting the output. For example, EvolInstruct task overrides this method to call the LLM.generate multiple times (one for each evolution). from typing import Any, Dict, List, Union, TYPE_CHECKING\n\nfrom distilabel.steps.tasks import Task\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns\n from distilabel.steps.tasks.typing import ChatType\n\n\nclass MyCustomTask(Task):\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"input_field\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n return [\n {\n \"role\": \"user\",\n \"content\": input[\"input_field\"],\n },\n ]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"output_field\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n return {\"output_field\": output}\n If your task just needs a system prompt, a user message template and a way to format the output given by the LLM , then you can use the @task decorator to avoid writing too much boilerplate code. from typing import Any, Dict, Union\nfrom distilabel.steps.tasks import task\n\n\n@task(inputs=[\"input_field\"], outputs=[\"output_field\"])\ndef MyCustomTask(output: Union[str, None], input: Union[Dict[str, Any], None] = None) -> Dict[str, Any]:\n \"\"\"\n ---\n system_prompt: |\n My custom system prompt\n\n user_message_template: |\n My custom user message template: {input_field}\n ---\n \"\"\"\n # Format the `LLM` output here\n return {\"output_field\": output}\n Warning Most Tasks reuse the Task.process method to process the generations, but if a new Task defines a custom process method, like happens for example with Magpie , one hast to deal with the statistics returned by the LLM . "},{"location":"sections/how_to_guides/basic/task/generator_task/","title":"GeneratorTask that produces output","text":""},{"location":"sections/how_to_guides/basic/task/generator_task/#working-with-generatortasks","title":"Working with GeneratorTasks","text":"The GeneratorTask is a custom implementation of a Task based on the GeneratorStep . As with a Task , it is normally used within a Pipeline but can also be used standalone. Warning This task is still experimental and may be subject to changes in the future. 
from typing import Any, Dict, List, Union\nfrom typing_extensions import override\n\nfrom distilabel.steps.tasks.base import GeneratorTask\nfrom distilabel.steps.tasks.typing import ChatType\nfrom distilabel.steps.typing import GeneratorOutput\n\n\nclass MyCustomTask(GeneratorTask):\n instruction: str\n\n @override\n def process(self, offset: int = 0) -> GeneratorOutput:\n output = self.llm.generate(\n inputs=[\n [\n {\"role\": \"user\", \"content\": self.instruction},\n ],\n ],\n )\n output = {\"model_name\": self.llm.model_name}\n output.update(\n self.format_output(output=output, input=None)\n )\n yield output\n\n @property\n def outputs(self) -> List[str]:\n return [\"output_field\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n return {\"output_field\": output}\n We can then use it as follows: task = MyCustomTask(\n name=\"custom-generation\",\n instruction=\"Tell me a joke.\",\n llm=OpenAILLM(model=\"gpt-4\"),\n)\ntask.load()\n\nnext(task.process())\n# [{'output_field\": \"Why did the scarecrow win an award? Because he was outstanding!\", \"model_name\": \"gpt-4\"}]\n Note Most of the times you would need to override the default process method, as it's suited for the standard Task and not for the GeneratorTask . But within the context of the process function you can freely use the llm to generate data in any way. Note The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution. "},{"location":"sections/how_to_guides/basic/task/generator_task/#defining-custom-generatortasks","title":"Defining custom GeneratorTasks","text":"We can define a custom generator task by creating a new subclass of the GeneratorTask and defining the following: -
process : is a method that generates the data based on the LLM and the instruction provided within the class instance, and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs . Note that the inputs argument is not allowed in this function since this is a GeneratorTask . The signature only expects the offset argument, which is used to keep track of the current iteration in the generator. -
outputs : is a property that returns a list of strings with the names of the output fields; this property should always include model_name as one of the outputs, since that's automatically injected from the LLM. -
format_output : is a method that receives the output from the LLM and optionally also the input data (which may be useful to build the output in some scenarios), and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs . Note that there's no need to include the model_name in the output. from typing import Any, Dict, List, Union\n\nfrom distilabel.steps.tasks.base import GeneratorTask\nfrom distilabel.steps.tasks.typing import ChatType\n\n\nclass MyCustomTask(GeneratorTask):\n @override\n def process(self, offset: int = 0) -> GeneratorOutput:\n output = self.llm.generate(\n inputs=[\n [{\"role\": \"user\", \"content\": \"Tell me a joke.\"}],\n ],\n )\n output = {\"model_name\": self.llm.model_name}\n output.update(\n self.format_output(output=output, input=None)\n )\n yield output\n\n @property\n def outputs(self) -> List[str]:\n return [\"output_field\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n return {\"output_field\": output}\n "},{"location":"sections/pipeline_samples/","title":"Tutorials","text":" - End-to-end tutorials provide detailed step-by-step explanations and the code used for end-to-end workflows.
- Paper implementations provide reproductions of fundamental papers in the synthetic data domain.
- Examples don't provide explanations but simply show code for different tasks. 
"},{"location":"sections/pipeline_samples/#end-to-end-tutorials","title":"End-to-end tutorials","text":" -
Generate a preference dataset Learn about synthetic data generation for ORPO and DPO. Tutorial -
Clean an existing preference dataset Learn about how to provide AI feedback to clean an existing dataset. Tutorial -
Retrieval and reranking models Learn about synthetic data generation for fine-tuning custom retrieval and reranking models. Tutorial -
Generate text classification data Learn about how synthetic data generation for text classification can help address data imbalance or scarcity. Tutorial "},{"location":"sections/pipeline_samples/#paper-implementations","title":"Paper Implementations","text":" -
Deepseek Prover Learn about an approach to generate mathematical proofs for theorems generated from informal math problems. Example -
DEITA Learn about prompt, response tuning for complexity and quality and LLMs as judges for automatic data selection. Paper -
Instruction Backtranslation Learn about automatically labeling human-written text with corresponding instructions. Paper -
Prometheus 2 Learn about using open-source models as judges for direct assessment and pair-wise ranking. Paper -
UltraFeedback Learn about a large-scale, fine-grained, diverse preference dataset, used for training powerful reward and critic models. Paper -
APIGen Learn how to create verifiable high-quality datasets for function-calling applications. Paper -
CLAIR Learn Contrastive Learning from AI Revisions (CLAIR), a data-creation method which leads to more contrastive preference pairs. Paper -
Math Shepherd Learn about Math-Shepherd, a framework to generate datasets to train process reward models (PRMs) which assign reward scores to each step of math problem solutions. Paper "},{"location":"sections/pipeline_samples/#examples","title":"Examples","text":" -
Benchmarking with distilabel Learn about reproducing the Arena Hard benchmark with distilabel. Example -
Structured generation with outlines Learn about generating RPG characters following a pydantic.BaseModel with outlines in distilabel. Example -
Structured generation with instructor Learn about answering instructions with knowledge graphs defined as pydantic.BaseModel objects using instructor in distilabel. Example -
Create a social network with FinePersonas Learn how to leverage FinePersonas to create a synthetic social network and fine-tune adapters for Multi-LoRA. Example -
Create questions and answers for an exam Learn how to generate questions and answers for an exam, using a raw Wikipedia page and structured generation. Example -
Text generation with images in distilabel Ask questions about images using distilabel. Example "},{"location":"sections/pipeline_samples/examples/benchmarking_with_distilabel/","title":"Benchmarking with distilabel ","text":"Benchmark LLMs with distilabel : reproducing the Arena Hard benchmark. The script below first defines both the ArenaHard and the ArenaHardResults tasks, so as to generate responses for a given collection of prompts/questions with up to two LLMs, and then calculate the results as per the original implementation, respectively. Additionally, the second part of the example builds a Pipeline to run the generation on top of the prompts with InferenceEndpointsLLM while streaming the rest of the generations from a pre-computed set of GPT-4 generations, and then evaluate one against the other with OpenAILLM generating an alternate response, a comparison between the responses, and a result as A>>B, A>B, B>A, B>>A, or tie. To run this example you will first need to install the Arena Hard optional dependencies, being pandas , scikit-learn , and numpy . Run python examples/arena_hard.py\n arena_hard.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom typing_extensions import override\n\nfrom distilabel.steps import GlobalStep, StepInput\nfrom distilabel.steps.tasks.base import Task\nfrom distilabel.steps.tasks.typing import ChatType\nfrom distilabel.steps.typing import StepOutput\n\n\nclass ArenaHard(Task):\n \"\"\"Evaluates two assistant responses using an LLM as judge.\n\n This `Task` is based on the \"From Live Data to High-Quality Benchmarks: The\n Arena-Hard Pipeline\" paper that presents Arena Hard, which is a benchmark for\n instruction-tuned LLMs that contains 500 challenging user queries. 
GPT-4 is used\n as the judge to compare the model responses against a baseline model, which defaults\n to `gpt-4-0314`.\n\n Note:\n Arena-Hard-Auto has the highest correlation and separability to Chatbot Arena\n among popular open-ended LLM benchmarks.\n\n Input columns:\n - instruction (`str`): The instruction to evaluate the responses.\n - generations (`List[str]`): The responses generated by two, and only two, LLMs.\n\n Output columns:\n - evaluation (`str`): The evaluation of the responses generated by the LLMs.\n - score (`str`): The score extracted from the evaluation.\n - model_name (`str`): The model name used to generate the evaluation.\n\n Categories:\n - benchmark\n\n References:\n - [From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline](https://lmsys.org/blog/2024-04-19-arena-hard/)\n - [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto/tree/main)\n\n Examples:\n\n Evaluate two assistant responses for a given instruction using Arean Hard prompts:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps import GroupColumns, LoadDataFromDicts\n from distilabel.steps.tasks import ArenaHard, TextGeneration\n\n with Pipeline() as pipeline:\n load_data = LoadDataFromDicts(\n data=[{\"instruction\": \"What is the capital of France?\"}],\n )\n\n text_generation_a = TextGeneration(\n llm=..., # LLM instance\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n text_generation_b = TextGeneration(\n llm=..., # LLM instance\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n combine = GroupColumns(\n columns=[\"generation\", \"generation_model\"],\n output_columns=[\"generations\", \"generation_models\"],\n )\n\n arena_hard = ArenaHard(\n llm=..., # LLM instance\n )\n\n load_data >> [text_generation_a, text_generation_b] >> combine >> arena_hard\n ```\n \"\"\"\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs required by this task are the `instruction` and the `generations`,\n which are the responses generated by two, and only two, LLMs.\"\"\"\n return [\"instruction\", \"generations\"]\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"This method formats the input data as a `ChatType` using the prompt defined\n by the Arena Hard benchmark, which consists on a `system_prompt` plus a template\n for the user first message that contains the `instruction` and both `generations`.\n \"\"\"\n return [\n {\n \"role\": \"system\",\n \"content\": \"Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\\n\\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\\n\\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\\n\\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. 
Concise means the response is clear and not verbose or excessive.\\n\\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\\n\\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\\n\\n1. Assistant A is significantly better: [[A>>B]]\\n2. Assistant A is slightly better: [[A>B]]\\n3. Tie, relatively the same: [[A=B]]\\n4. Assistant B is slightly better: [[B>A]]\\n5. Assistant B is significantly better: [[B>>A]]\\n\\nExample output: \\\"My final verdict is tie: [[A=B]]\\\".\",\n },\n {\n \"role\": \"user\",\n \"content\": f\"<|User Prompt|>\\n{input['instruction']}\\n\\n<|The Start of Assistant A's Answer|>\\n{input['generations'][0]}\\n<|The End of Assistant A's Answer|>\\n\\n<|The Start of Assistant B's Answer|>\\n{input['generations'][1]}\\n<|The End of Assistant B's Answer|>\",\n },\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The outputs generated by this task are the `evaluation`, the `score` and\n the `model_name` (which is automatically injected within the `process` method\n of the parent task).\"\"\"\n return [\"evaluation\", \"score\", \"model_name\"]\n\n def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n ) -> Dict[str, Any]:\n \"\"\"This method formats the output generated by the LLM as a Python dictionary\n containing the `evaluation` which is the raw output generated by the LLM (consisting\n of the judge LLM alternate generation for the given instruction, plus an explanation\n on the evaluation of the given responses; plus the `score` extracted from the output.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Is provided in case it needs to be used to enrich\n the output if needed.\n\n Returns:\n A dict with the keys `evaluation` with the raw output which contains the LLM\n evaluation and the extracted `score` if possible.\n \"\"\"\n if output is None:\n return {\"evaluation\": None, \"score\": None}\n pattern = re.compile(r\"\\[\\[([AB<>=]+)\\]\\]\")\n match = pattern.search(output)\n if match is None:\n return {\"evaluation\": output, \"score\": None}\n return {\"evaluation\": output, \"score\": match.group(1)}\n\n\nclass ArenaHardResults(GlobalStep):\n \"\"\"Process Arena Hard results to calculate the ELO scores.\n\n This `Step` is based on the \"From Live Data to High-Quality Benchmarks: The\n Arena-Hard Pipeline\" paper that presents Arena Hard, which is a benchmark for\n instruction-tuned LLMs that contains 500 challenging user queries. 
This step is\n a `GlobalStep` that should run right after the `ArenaHard` task to calculate the\n ELO scores for the evaluated models.\n\n Note:\n Arena-Hard-Auto has the highest correlation and separability to Chatbot Arena\n among popular open-ended LLM benchmarks.\n\n Input columns:\n - evaluation (`str`): The evaluation of the responses generated by the LLMs.\n - score (`str`): The score extracted from the evaluation.\n\n References:\n - [From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline](https://lmsys.org/blog/2024-04-19-arena-hard/)\n - [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto/tree/main)\n\n Examples:\n\n Rate the ELO scores for two assistant responses for a given an evaluation / comparison between both using Arean Hard prompts:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps import GroupColumns, LoadDataFromDicts\n from distilabel.steps.tasks import ArenaHard, TextGeneration\n\n with Pipeline() as pipeline:\n load_data = LoadDataFromDicts(\n data=[{\"instruction\": \"What is the capital of France?\"}],\n )\n\n text_generation_a = TextGeneration(\n llm=..., # LLM instance\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n text_generation_b = TextGeneration(\n llm=..., # LLM instance\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n combine = GroupColumns(\n columns=[\"generation\", \"generation_model\"],\n output_columns=[\"generations\", \"generation_models\"],\n )\n\n arena_hard = ArenaHard(\n llm=..., # LLM instance\n )\n\n arena_hard_results = ArenaHardResults(\n custom_model_column=\"generation_models\",\n custom_weights={\"A>B\": 1, \"A>>B\": 3, \"B>A\": 1, \"B>>A\": 3},\n )\n\n load_data >> [text_generation_a, text_generation_b] >> combine >> arena_hard >> arena_hard_results\n ```\n\n \"\"\"\n\n custom_model_column: Optional[str] = None\n custom_weights: Dict[str, int] = {\"A>B\": 1, \"A>>B\": 3, \"B>A\": 1, \"B>>A\": 3}\n\n def load(self) -> None:\n \"\"\"Ensures that the required dependencies are installed.\"\"\"\n super().load()\n\n try:\n import numpy as np # noqa: F401\n import pandas as pd # noqa: F401\n from sklearn.linear_model import LogisticRegression # noqa: F401\n except ImportError as e:\n raise ImportError(\n \"In order to run `ArenaHardResults`, the `arena-hard` extra dependencies\"\n \" must be installed i.e. `numpy`, `pandas`, and `scikit-learn`.\\n\"\n \"Please install the dependencies by running `pip install distilabel[arena-hard]`.\"\n ) from e\n\n # TODO: the `evaluation` is not really required as an input, so it could be removed, since\n # only `score` is used / required\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs required by this step are the `evaluation` and the `score` generated\n by the `ArenaHard` task. Since this step does use the identifiers `model_a` and `model_b`,\n optionally one can set `custom_model_column` to use the model names if existing within\n the input data, ideally this value should be `model_name` if connected from the `ArenaHard`\n step.\"\"\"\n columns = [\"evaluation\", \"score\"]\n if self.custom_model_column:\n columns.append(self.custom_model_column)\n return columns\n\n @override\n def process(self, inputs: StepInput) -> StepOutput: # type: ignore\n \"\"\"This method processes the inputs generated by the `ArenaHard` task to calculate the\n win rates for each of the models to evaluate. 
Since this step inherits from the `GlobalStep`,\n it will wait for all the input batches to be processed, and then the output will be yielded in\n case there's a follow up step, since this step won't modify the received inputs.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n\n References:\n - https://github.com/lm-sys/arena-hard-auto/blob/main/show_result.py\n \"\"\"\n import numpy as np\n import pandas as pd\n from sklearn.linear_model import LogisticRegression\n\n models = [\"A\", \"B\"]\n if self.custom_model_column:\n models = inputs[0][self.custom_model_column]\n\n # TODO: the battles are only calculated for the first game, even though the official\n # implementation also covers the possibility of a second game (not within the released\n # dataset yet)\n battles = pd.DataFrame()\n for input in inputs:\n output = {\n # TODO: \"question_id\": input[\"question_id\"],\n \"model_a\": models[0],\n \"model_b\": models[1],\n }\n if input[\"score\"] in [\"A>B\", \"A>>B\"]:\n output[\"winner\"] = models[0]\n rows = [output] * self.custom_weights[input[\"score\"]]\n elif input[\"score\"] in [\"B>A\", \"B>>A\"]:\n output[\"winner\"] = models[1]\n rows = [output] * self.custom_weights[input[\"score\"]]\n elif input[\"score\"] == \"A=B\":\n output[\"winner\"] = \"tie\"\n rows = [output]\n else:\n continue\n\n battles = pd.concat([battles, pd.DataFrame(rows)])\n\n models = pd.concat([battles[\"model_a\"], battles[\"model_b\"]]).unique()\n models = pd.Series(np.arange(len(models)), index=models)\n\n battles = pd.concat([battles, battles], ignore_index=True)\n p = len(models.index)\n n = battles.shape[0]\n\n X = np.zeros([n, p])\n X[np.arange(n), models[battles[\"model_a\"]]] = +np.log(10)\n X[np.arange(n), models[battles[\"model_b\"]]] = -np.log(10)\n\n Y = np.zeros(n)\n Y[battles[\"winner\"] == \"model_a\"] = 1.0\n\n tie_idx = battles[\"winner\"] == \"tie\"\n tie_idx[len(tie_idx) // 2 :] = False\n Y[tie_idx] = 1.0\n\n lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8) # type: ignore\n lr.fit(X, Y)\n\n # The ELO scores are calculated assuming that the reference is `gpt-4-0314`\n # with an starting ELO of 1000, so that the evaluated models are compared with\n # `gtp-4-0314` only if it's available within the models\n elo_scores = 400 * lr.coef_[0] + 1000\n # TODO: we could parametrize the reference / anchor model, but left as is to be faithful to the\n # original implementation\n if \"gpt-4-0314\" in models.index:\n elo_scores += 1000 - elo_scores[models[\"gpt-4-0314\"]]\n\n output = pd.Series(elo_scores, index=models.index).sort_values(ascending=False)\n self._logger.info(f\"Arena Hard ELO: {output}\")\n\n # Here only so that if follow up steps are connected the inputs are preserved,\n # since this step doesn't modify nor generate new inputs\n yield inputs\n\n\nif __name__ == \"__main__\":\n import json\n\n from distilabel.models import InferenceEndpointsLLM, OpenAILLM\n from distilabel.pipeline import Pipeline\n from distilabel.steps import (\n GroupColumns,\n KeepColumns,\n LoadDataFromHub,\n StepInput,\n step,\n )\n from distilabel.steps.tasks import TextGeneration\n from distilabel.steps.typing import StepOutput\n\n @step(inputs=[\"turns\"], outputs=[\"system_prompt\", \"instruction\"])\n def PrepareForTextGeneration(*inputs: StepInput) -> StepOutput:\n for input in inputs:\n for item in input:\n item[\"system_prompt\"] = \"You are a helpful assistant.\"\n 
item[\"instruction\"] = item[\"turns\"][0][\"content\"]\n yield input\n\n @step(\n inputs=[\"question_id\"],\n outputs=[\"generation\", \"generation_model\"],\n step_type=\"global\",\n )\n def LoadReference(*inputs: StepInput) -> StepOutput:\n # File downloaded from https://raw.githubusercontent.com/lm-sys/arena-hard-auto/e0a8ea1df42c1df76451a6cd04b14e31ff992b87/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl\n lines = open(\"gpt-4-0314.jsonl\", mode=\"r\").readlines()\n for input in inputs:\n for item in input:\n for line in lines:\n data = json.loads(line)\n if data[\"question_id\"] == item[\"question_id\"]:\n item[\"generation\"] = data[\"choices\"][0][\"turns\"][0][\"content\"]\n item[\"generation_model\"] = data[\"model_id\"]\n break\n yield input\n\n with Pipeline(name=\"arena-hard-v0.1\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n repo_id=\"alvarobartt/lmsys-arena-hard-v0.1\",\n split=\"test\",\n num_examples=5,\n )\n\n load_reference = LoadReference(name=\"load_reference\")\n\n prepare = PrepareForTextGeneration(name=\"prepare\")\n\n text_generation_cohere = TextGeneration(\n name=\"text_generation_cohere\",\n llm=InferenceEndpointsLLM(\n model_id=\"CohereForAI/c4ai-command-r-plus\",\n tokenizer_id=\"CohereForAI/c4ai-command-r-plus\",\n ),\n use_system_prompt=True,\n input_batch_size=10,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n combine_columns = GroupColumns(\n name=\"combine_columns\",\n columns=[\"generation\", \"generation_model\"],\n output_columns=[\"generations\", \"generation_models\"],\n )\n\n arena_hard = ArenaHard(\n name=\"arena_hard\",\n llm=OpenAILLM(model=\"gpt-4-1106-preview\"),\n output_mappings={\"model_name\": \"evaluation_model\"},\n )\n\n keep_columns = KeepColumns(\n name=\"keep_columns\",\n columns=[\n \"question_id\",\n \"category\",\n \"cluster\",\n \"system_prompt\",\n \"instruction\",\n \"generations\",\n \"generation_models\",\n \"evaluation\",\n \"score\",\n \"evaluation_model\",\n ],\n )\n\n win_rates = ArenaHardResults(\n name=\"win_rates\", custom_model_column=\"generation_models\"\n )\n\n load_dataset >> load_reference # type: ignore\n load_dataset >> prepare >> text_generation_cohere # type: ignore\n ( # type: ignore\n [load_reference, text_generation_cohere]\n >> combine_columns\n >> arena_hard\n >> keep_columns\n >> win_rates\n )\n\n distiset = pipeline.run(\n parameters={ # type: ignore\n text_generation_cohere.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096,\n \"stop_sequences\": [\"<EOS_TOKEN>\", \"<|END_OF_TURN_TOKEN|>\"],\n }\n }\n },\n arena_hard.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.0,\n \"max_new_tokens\": 4096,\n }\n }\n },\n },\n )\n if distiset is not None:\n distiset.push_to_hub(\"arena-hard-results\")\n "},{"location":"sections/pipeline_samples/examples/exam_questions/","title":"Create exam questions using structured generation","text":"This example will showcase how to generate exams questions and answers from a text page. In this case, we will use a wikipedia page as an example, and show how to leverage the prompt to help the model generate the data in the appropriate format. We are going to use a meta-llama/Meta-Llama-3.1-8B-Instruct to generate questions and answers for a mock exam from a wikipedia page. In this case, we are going to use the Transfer Learning entry for it. With the help of structured generation we will guide the model to create structured data for us that is easy to parse. 
The structure will be question, answer, and distractors (wrong answers). Click to see the sample results Example page Transfer_learning: QA of the page: {\n \"exam\": [\n {\n \"answer\": \"A technique in machine learning where knowledge learned from a task is re-used to boost performance on a related task.\",\n \"distractors\": [\"A type of neural network architecture\", \"A machine learning algorithm for image classification\", \"A method for data preprocessing\"],\n \"question\": \"What is transfer learning?\"\n },\n {\n \"answer\": \"1976\",\n \"distractors\": [\"1981\", \"1992\", \"1998\"],\n \"question\": \"In which year did Bozinovski and Fulgosi publish a paper addressing transfer learning in neural network training?\"\n },\n {\n \"answer\": \"Discriminability-based transfer (DBT) algorithm\",\n \"distractors\": [\"Multi-task learning\", \"Learning to Learn\", \"Cost-sensitive machine learning\"],\n \"question\": \"What algorithm was formulated by Lorien Pratt in 1992?\"\n },\n {\n \"answer\": \"A domain consists of a feature space and a marginal probability distribution.\",\n \"distractors\": [\"A domain consists of a label space and an objective predictive function.\", \"A domain consists of a task and a learning algorithm.\", \"A domain consists of a dataset and a model.\"],\n \"question\": \"What is the definition of a domain in the context of transfer learning?\"\n },\n {\n \"answer\": \"Transfer learning aims to help improve the learning of the target predictive function in the target domain using the knowledge in the source domain and learning task.\",\n \"distractors\": [\"Transfer learning aims to learn a new task from scratch.\", \"Transfer learning aims to improve the learning of the source predictive function in the source domain.\", \"Transfer learning aims to improve the learning of the target predictive function in the source domain.\"],\n \"question\": \"What is the goal of transfer learning?\"\n },\n {\n \"answer\": \"Markov logic networks, Bayesian networks, cancer subtype discovery, building utilization, general game playing, text classification, digit recognition, medical imaging, and spam filtering.\",\n \"distractors\": [\"Supervised learning, unsupervised learning, reinforcement learning, natural language processing, computer vision, and robotics.\", \"Image classification, object detection, segmentation, and tracking.\", \"Speech recognition, sentiment analysis, and topic modeling.\"],\n \"question\": \"What are some applications of transfer learning?\"\n },\n {\n \"answer\": \"ADAPT (Python), TLib (Python), Domain-Adaptation-Toolbox (Matlab)\",\n \"distractors\": [\"TensorFlow, PyTorch, Keras\", \"Scikit-learn, OpenCV, NumPy\", \"Matlab, R, Julia\"],\n \"question\": \"What are some software implementations of transfer learning and domain adaptation algorithms?\"\n }\n ]\n}\n "},{"location":"sections/pipeline_samples/examples/exam_questions/#build-the-pipeline","title":"Build the pipeline","text":"Let's see how to build a pipeline to obtain this type of data: from typing import List\nfrom pathlib import Path\n\nfrom pydantic import BaseModel, Field\n\nfrom distilabel.llms import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\nimport wikipedia\n\npage = wikipedia.page(title=\"Transfer_learning\") # (1)\n\n\nclass ExamQuestion(BaseModel):\n question: str = Field(..., description=\"The question to be answered\")\n answer: str = Field(..., 
description=\"The correct answer to the question\")\n distractors: List[str] = Field(\n ..., description=\"A list of incorrect but viable answers to the question\"\n )\n\nclass ExamQuestions(BaseModel): # (2)\n exam: List[ExamQuestion]\n\n\nSYSTEM_PROMPT = \"\"\"\\\nYou are an exam writer specialized in writing exams for students.\nYour goal is to create questions and answers based on the document provided, and a list of distractors, that are incorrect but viable answers to the question.\nYour answer must adhere to the following format:\n```\n[\n {\n \"question\": \"Your question\",\n \"answer\": \"The correct answer to the question\",\n \"distractors\": [\"wrong answer 1\", \"wrong answer 2\", \"wrong answer 3\"]\n },\n ... (more questions and answers as required)\n]\n```\n\"\"\".strip() #\u00a0(3)\n\n\nwith Pipeline(name=\"ExamGenerator\") as pipeline:\n\n load_dataset = LoadDataFromDicts(\n name=\"load_instructions\",\n data=[\n {\n \"page\": page.content, #\u00a0(4)\n }\n ],\n )\n\n text_generation = TextGeneration( # (5)\n name=\"exam_generation\",\n system_prompt=SYSTEM_PROMPT,\n template=\"Generate a list of answers and questions about the document. Document:\\n\\n{{ page }}\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n structured_output={\n \"schema\": ExamQuestions.model_json_schema(),\n \"format\": \"json\"\n },\n ),\n input_batch_size=8,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n load_dataset >> text_generation # (6)\n -
Download a single page for the demo. We could download the pages first, or apply the same procedure to any other type of data we want. In a real-world use case, we would want to make a dataset from these documents first. -
Define the structure required for the answer using Pydantic. In this case we want, for each page, a list of questions and answers (additionally we've added distractors, but they can be ignored for this case). So our output will be an ExamQuestions model, which is a list of ExamQuestion , where each one consists of a question and an answer as string fields. The language model will use the field descriptions to generate the values. -
Use the system prompt to guide the model towards the behaviour we want from it. Independently of the structured output we are enforcing, it helps if we also describe the expected format in the prompt. -
Move the page content from wikipedia to a row in the dataset. -
The TextGeneration task gets the system prompt, and builds the user prompt by means of the template argument, where we guide the model to generate the questions and answers based on the page content, which will be obtained from the corresponding column of the loaded data. -
Connect both steps, and we are done. "},{"location":"sections/pipeline_samples/examples/exam_questions/#run-the-example","title":"Run the example","text":"To run this example you will first need to install the wikipedia dependency to download the sample data, being pip install wikipedia . Change the username first in case you want to push the dataset to the hub using your account. Run python examples/exam_questions.py\n exam_questions.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nimport wikipedia\nfrom pydantic import BaseModel, Field\n\nfrom distilabel.llms import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\npage = wikipedia.page(title=\"Transfer_learning\")\n\n\nclass ExamQuestion(BaseModel):\n question: str = Field(..., description=\"The question to be answered\")\n answer: str = Field(..., description=\"The correct answer to the question\")\n distractors: List[str] = Field(\n ..., description=\"A list of incorrect but viable answers to the question\"\n )\n\n\nclass ExamQuestions(BaseModel):\n exam: List[ExamQuestion]\n\n\nSYSTEM_PROMPT = \"\"\"\\\nYou are an exam writer specialized in writing exams for students.\nYour goal is to create questions and answers based on the document provided, and a list of distractors, that are incorrect but viable answers to the question.\nYour answer must adhere to the following format:\n```\n[\n {\n \"question\": \"Your question\",\n \"answer\": \"The correct answer to the question\",\n \"distractors\": [\"wrong answer 1\", \"wrong answer 2\", \"wrong answer 3\"]\n },\n ... (more questions and answers as required)\n]\n```\n\"\"\".strip()\n\n\nwith Pipeline(name=\"ExamGenerator\") as pipeline:\n load_dataset = LoadDataFromDicts(\n name=\"load_instructions\",\n data=[\n {\n \"page\": page.content,\n }\n ],\n )\n\n text_generation = TextGeneration(\n name=\"exam_generation\",\n system_prompt=SYSTEM_PROMPT,\n template=\"Generate a list of answers and questions about the document. 
Document:\\n\\n{{ page }}\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n structured_output={\n \"schema\": ExamQuestions.model_json_schema(),\n \"format\": \"json\",\n },\n ),\n input_batch_size=8,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n load_dataset >> text_generation\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n text_generation.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 2048,\n }\n }\n }\n },\n use_cache=False,\n )\n distiset.push_to_hub(\"USERNAME/exam_questions\")\n "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/","title":"Create a social network with FinePersonas","text":"In this example, we'll explore the creation of specialized user personas for social network interactions using the FinePersonas-v0.1 dataset from Hugging Face. The final dataset will be ready to fine-tune a chat model with specific traits and characteristics. "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#introduction","title":"Introduction","text":"We'll delve into the process of fine-tuning different LoRA (Low-Rank Adaptation) models to imbue these personas with specific traits and characteristics. This approach draws inspiration from Michael Sayman's work on SocialAI (visit the profile to see some examples), to leverage FinePersonas-v0.1 for building models that can emulate bots with specific behaviour. By fine-tuning these adapters, we can potentially create AI personas with distinct characteristics, communication styles, and areas of expertise. The result? AI interactions that feel more natural and tailored to specific contexts or user needs. For those interested in the technical aspects of this approach, we recommend the insightful blog post on Multi-LoRA serving. It provides a clear and comprehensive explanation of the technology behind this innovative method. Let's jump to the demo. "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#creating-our-socialai-task","title":"Creating our SocialAI Task","text":"Building on the new TextGeneration , creating custom tasks is easier than ever before. This powerful tool opens up a world of possibilities for creating tailored text-based content with ease and precision. We will create a SocialAI task that will be in charge of generating responses to user interactions, taking into account a given follower_type , and use the perspective from a given persona : from distilabel.steps.tasks import TextGeneration\n\nclass SocialAI(TextGeneration):\n follower_type: Literal[\"supporter\", \"troll\", \"alarmist\"] = \"supporter\"\n system_prompt: str = (\n \"You are an AI assistant expert at simulating user interactions. 
\"\n \"You must answer as if you were a '{follower_type}', be concise answer with no more than 200 characters, nothing else.\"\n \"Here are some traits to use for your personality:\\n\\n\"\n \"{traits}\"\n ) #\u00a0(1)\n template: str = \"You are the folowing persona:\\n\\n{{ persona }}\\n\\nWhat would you say to the following?\\n\\n {{ post }}\" # (2)\n columns: str | list[str] = [\"persona\", \"post\"] # (3)\n\n _follower_traits: dict[str, str] = {\n \"supporter\": (\n \"- Encouraging and positive\\n\"\n \"- Tends to prioritize enjoyment and relaxation\\n\"\n \"- Focuses on the present moment and short-term pleasure\\n\"\n \"- Often uses humor and playful language\\n\"\n \"- Wants to help others feel good and have fun\\n\"\n ),\n \"troll\": (\n \"- Provocative and confrontational\\n\"\n \"- Enjoys stirring up controversy and conflict\\n\"\n \"- Often uses sarcasm, irony, and mocking language\\n\"\n \"- Tends to belittle or dismiss others' opinions and feelings\\n\"\n \"- Seeks to get a rise out of others and create drama\\n\"\n ),\n \"alarmist\": (\n \"- Anxious and warning-oriented\\n\"\n \"- Focuses on potential risks and negative consequences\\n\"\n \"- Often uses dramatic or sensational language\\n\"\n \"- Tends to be serious and stern in tone\\n\"\n \"- Seeks to alert others to potential dangers and protect them from harm (even if it's excessive or unwarranted)\\n\"\n ),\n }\n\n def load(self) -> None:\n super().load()\n self.system_prompt = self.system_prompt.format(\n follower_type=self.follower_type,\n traits=self._follower_traits[self.follower_type]\n ) # (4)\n -
We have a custom system prompt that will depend on the follower_type we choose for our model. -
The base template or prompt will answer the post we have, from the point of view of a persona . -
We will need our dataset to have both persona and post columns to populate the prompt. -
In the load method we place the specific traits for our follower type in the system prompt. "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#data-preparation","title":"Data preparation","text":"This is an example, so let's keep it short. We will use 3 posts, and 3 different types of personas. While there's potential to enhance this process (perhaps by implementing random persona selection or leveraging semantic similarity) we'll opt for a straightforward method in this demonstration. Our goal is to create a set of nine examples, each pairing a post with a persona. To achieve this, we'll employ an LLM to respond to each post from the perspective of a specific persona , effectively simulating how different characters might engage with the content. posts = [\n {\n \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\"\n },\n {\n \"post\": \"I need to develop a training course for my company on communication skills. Need to decide how deliver it remotely.\"\n },\n {\n \"post\": \"I'm always 10 minutes late to meetups but no one's complained. Could this be annoying to them?\"\n },\n]\n\npersonas = (\n load_dataset(\"argilla/FinePersonas-v0.1-clustering-100k\", split=\"train\")\n .shuffle()\n .select(range(3))\n .select_columns(\"persona\")\n .to_list()\n)\n\ndata = []\nfor post in posts:\n for persona in personas:\n data.append({\"post\": post[\"post\"], \"persona\": persona[\"persona\"]})\n Each row in will have the following format: import json\nprint(json.dumps(data[0], indent=4))\n{\n \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\",\n \"persona\": \"A high school or college environmental science teacher or an ecology student specializing in biogeography and ecosystem dynamics.\"\n}\n This will be our dataset, that we can ingest using the LoadDataFromDicts : loader = LoadDataFromDicts(data=data)\n "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#simulating-from-different-types-of-followers","title":"Simulating from different types of followers","text":"With our data in hand, we're ready to explore the capabilities of our SocialAI task. For this demonstration, we'll make use of of meta-llama/Meta-Llama-3.1-70B-Instruct While this model has become something of a go-to choice recently, it's worth noting that experimenting with a variety of models could yield even more interesting results: from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 256,\n },\n)\nfollower_type = \"supporter\"\n\nfollower = SocialAI(\n llm=llm,\n follower_type=follower_type,\n name=f\"{follower_type}_user\",\n)\n This setup simplifies the process, we only need to input the follower type, and the system handles the rest. We could update this too to have a random type of follower by default, and simulate from a bunch of different personalities. "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#building-our-pipeline","title":"Building our Pipeline","text":"The foundation of our pipeline is now in place. At its core is a single, powerful LLM. 
This versatile model will be repurposed to drive three distinct SocialAI Tasks, each tailored to a specific TextGeneration task, and each one of them will be prepared for Supervised Fine Tuning using FormatTextGenerationSFT : with Pipeline(name=\"Social AI Personas\") as pipeline:\n loader = LoadDataFromDicts(data=data, batch_size=1)\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 256,\n },\n )\n\n for follower_type in [\"supporter\", \"troll\", \"alarmist\"]:\n follower = SocialAI(\n llm=llm,\n follower_type=follower_type,\n name=f\"{follower_type}_user\", # (1)\n output_mappings={\n \"generation\": f\"interaction_{follower_type}\" # (2)\n }\n )\n format_sft = FormatTextGenerationSFT(\n name=f\"format_sft_{follower_type}\",\n input_mappings={\n \"instruction\": \"post\",\n \"generation\": f\"interaction_{follower_type}\" # (3)\n },\n )\n loader >> follower >> format_sft # (4)\n -
We update the name of the step to keep track of it in the pipeline. -
The generation column from each LLM will be mapped to a different name to avoid it being overridden, as we are reusing the same task. -
As we have modified the output column from SocialAI , we redirect each one of the \"follower_type\" responses. -
Connect the loader to each one of the follower tasks and format_sft to obtain 3 different subsets. The outcome of this pipeline will be three specialized models, each fine-tuned to a unique follower type crafted by the SocialAI task. These models will generate SFT-formatted datasets, where each post is paired with its corresponding interaction data for a specific follower type. This setup enables seamless fine-tuning using your preferred framework, such as TRL, or any other training framework of your choice. "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#script-and-final-dataset","title":"Script and final dataset","text":"All the pieces are in place for our script, the full pipeline can be seen here: Run python examples/finepersonas_social_ai.py\n finepersonas_social_ai.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Literal\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import FormatTextGenerationSFT, LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass SocialAI(TextGeneration):\n follower_type: Literal[\"supporter\", \"troll\", \"alarmist\"] = \"supporter\"\n system_prompt: str = (\n \"You are an AI assistant expert at simulating user interactions. 
\"\n \"You must answer as if you were a '{follower_type}', be concise answer with no more than 200 characters, nothing else.\"\n \"Here are some traits to use for your personality:\\n\\n\"\n \"{traits}\"\n )\n template: str = \"You are the folowing persona:\\n\\n{{ persona }}\\n\\nWhat would you say to the following?\\n\\n {{ post }}\"\n columns: str | list[str] = [\"persona\", \"post\"]\n\n _follower_traits: dict[str, str] = {\n \"supporter\": (\n \"- Encouraging and positive\\n\"\n \"- Tends to prioritize enjoyment and relaxation\\n\"\n \"- Focuses on the present moment and short-term pleasure\\n\"\n \"- Often uses humor and playful language\\n\"\n \"- Wants to help others feel good and have fun\\n\"\n ),\n \"troll\": (\n \"- Provocative and confrontational\\n\"\n \"- Enjoys stirring up controversy and conflict\\n\"\n \"- Often uses sarcasm, irony, and mocking language\\n\"\n \"- Tends to belittle or dismiss others' opinions and feelings\\n\"\n \"- Seeks to get a rise out of others and create drama\\n\"\n ),\n \"alarmist\": (\n \"- Anxious and warning-oriented\\n\"\n \"- Focuses on potential risks and negative consequences\\n\"\n \"- Often uses dramatic or sensational language\\n\"\n \"- Tends to be serious and stern in tone\\n\"\n \"- Seeks to alert others to potential dangers and protect them from harm (even if it's excessive or unwarranted)\\n\"\n ),\n }\n\n def load(self) -> None:\n super().load()\n self.system_prompt = self.system_prompt.format(\n follower_type=self.follower_type,\n traits=self._follower_traits[self.follower_type],\n )\n\n\nposts = [\n {\n \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\"\n },\n {\n \"post\": \"I need to develop a training course for my company on communication skills. Need to decide how deliver it remotely.\"\n },\n {\n \"post\": \"I'm always 10 minutes late to meetups but no one's complained. 
Could this be annoying to them?\"\n },\n]\n\npersonas = (\n load_dataset(\"argilla/FinePersonas-v0.1-clustering-100k\", split=\"train\")\n .shuffle()\n .select(range(3))\n .select_columns(\"persona\")\n .to_list()\n)\n\ndata = []\nfor post in posts:\n for persona in personas:\n data.append({\"post\": post[\"post\"], \"persona\": persona[\"persona\"]})\n\n\nwith Pipeline(name=\"Social AI Personas\") as pipeline:\n loader = LoadDataFromDicts(data=data, batch_size=1)\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 256,\n },\n )\n\n for follower_type in [\"supporter\", \"troll\", \"alarmist\"]:\n follower = SocialAI(\n llm=llm,\n follower_type=follower_type,\n name=f\"{follower_type}_user\",\n output_mappings={\"generation\": f\"interaction_{follower_type}\"},\n )\n format_sft = FormatTextGenerationSFT(\n name=f\"format_sft_{follower_type}\",\n input_mappings={\n \"instruction\": \"post\",\n \"generation\": f\"interaction_{follower_type}\",\n },\n )\n loader >> follower >> format_sft\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n distiset.push_to_hub(\"plaguss/FinePersonas-SocialAI-test\", include_script=True)\n This is the final toy dataset we obtain: FinePersonas-SocialAI-test You can see examples of how to load each subset of them to fine-tune a model: from datasets import load_dataset\n\nds = load_dataset(\"plaguss/FinePersonas-SocialAI-test\", \"format_sft_troll\")\n And a sample of the generated field with the corresponding post and persona : {\n \"post\": \"Hmm, ok now I\\u0027m torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\",\n \"persona\": \"A high school or undergraduate physics or chemistry teacher, likely with a focus on experimental instruction.\",\n \"interaction_troll\": \"\\\"Late night cravings? More like late night brain drain. Either way, it\\u0027s just a collision of molecules in your stomach. Choose the one with more calories, at least that\\u0027s some decent kinetic energy.\\\"\",\n}\n There's a lot of room for improvement, but quite a promising start. "},{"location":"sections/pipeline_samples/examples/llama_cpp_with_outlines/","title":"Structured generation with outlines ","text":"Generate RPG characters following a pydantic.BaseModel with outlines in distilabel . This script makes use of LlamaCppLLM and the structured output capabilities thanks to outlines to generate RPG characters that adhere to a JSON schema. It makes use of a local model which can be downloaded using curl (explained in the script itself), and can be exchanged with other LLMs like vLLM . 
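As a quick sanity check, the generations are plain JSON strings, so they can be parsed back into the very model that guided them; the following is a minimal sketch (assuming the Character model and the distiset produced by the script below), not part of the original example: from pydantic import ValidationError\n\nfor generation in distiset[\"default\"][\"train\"][\"generation\"]:\n try:\n # Re-validate each generation against the schema used to guide decoding\n character = Character.model_validate_json(generation)\n print(character.name, character.armor.value, character.weapon.value)\n except ValidationError as err:\n print(f\"Invalid character generated: {err}\")\n 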
Run python examples/structured_generation_with_outlines.py\n structured_generation_with_outlines.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom enum import Enum\nfrom pathlib import Path\n\nfrom pydantic import BaseModel, StringConstraints, conint\nfrom typing_extensions import Annotated\n\nfrom distilabel.models import LlamaCppLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass Weapon(str, Enum):\n sword = \"sword\"\n axe = \"axe\"\n mace = \"mace\"\n spear = \"spear\"\n bow = \"bow\"\n crossbow = \"crossbow\"\n\n\nclass Armor(str, Enum):\n leather = \"leather\"\n chainmail = \"chainmail\"\n plate = \"plate\"\n mithril = \"mithril\"\n\n\nclass Character(BaseModel):\n name: Annotated[str, StringConstraints(max_length=30)]\n age: conint(gt=1, lt=3000)\n armor: Armor\n weapon: Weapon\n\n\n# Download the model with\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nwith Pipeline(\"RPG-characters\") as pipeline:\n system_prompt = (\n \"You are a leading role play gamer. 
You have seen thousands of different characters and their attributes.\"\n \" Please return a JSON object with common attributes of an RPG character.\"\n )\n\n load_dataset = LoadDataFromDicts(\n name=\"load_instructions\",\n data=[\n {\n \"system_prompt\": system_prompt,\n \"instruction\": f\"Give me a character description for a {char}\",\n }\n for char in [\"dwarf\", \"elf\", \"human\", \"ork\"]\n ],\n )\n llm = LlamaCppLLM(\n model_path=str(Path.home() / model_path), # type: ignore\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"json\", \"schema\": Character},\n )\n # Change to vLLM as such:\n # llm = vLLM(\n # model=\"teknium/OpenHermes-2.5-Mistral-7B\",\n # extra_kwargs={\"tensor_parallel_size\": 1},\n # structured_output={\"format\": \"json\", \"schema\": Character},\n # )\n\n text_generation = TextGeneration(\n name=\"text_generation_rpg\",\n llm=llm,\n input_batch_size=8,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n load_dataset >> text_generation\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n text_generation.name: {\n \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 256}}\n }\n },\n use_cache=False,\n )\n for num, character in enumerate(distiset[\"default\"][\"train\"][\"generation\"]):\n print(f\"Character: {num}\")\n print(character)\n\n# Character: 0\n# {\n# \"name\": \"Gimli\",\n# \"age\": 42,\n# \"armor\": \"plate\",\n# \"weapon\": \"axe\" }\n# Character: 1\n# {\"name\":\"Gaelen\",\"age\":600,\"armor\":\"leather\",\"weapon\":\"bow\"}\n# Character: 2\n# {\"name\": \"John Smith\",\"age\": 35,\"armor\": \"leather\",\"weapon\": \"sword\"}\n# Character: 3\n# { \"name\": \"Grug\", \"age\": 35, \"armor\": \"leather\", \"weapon\": \"axe\"}\n "},{"location":"sections/pipeline_samples/examples/mistralai_with_instructor/","title":"Structured generation with instructor ","text":"Answer instructions with knowledge graphs defined as pydantic.BaseModel objects using instructor in distilabel . This script makes use of MistralLLM and the structured output capabilities thanks to instructor to generate knowledge graphs from complex topics. This example is translated from this awesome example from instructor cookbook. 
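The knowledge graphs produced by this example can also be drawn; the snippet below is a minimal sketch (assuming the KnowledgeGraph model defined in the script below and the graphviz package installed), not the draw_kg.py helper referenced later: from graphviz import Digraph\n\ndef render_graph(kg: KnowledgeGraph, path: str = \"knowledge_graph\") -> None:\n # Draw every node and edge with the colors suggested by the model\n dot = Digraph(comment=\"Knowledge Graph\")\n for node in kg.nodes:\n dot.node(str(node.id), node.label, color=node.color)\n for edge in kg.edges:\n dot.edge(str(edge.source), str(edge.target), label=edge.label, color=edge.color)\n dot.render(path, format=\"png\", view=False)\n 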
Run python examples/structured_generation_with_instructor.py\n structured_generation_with_instructor.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nfrom pydantic import BaseModel, Field\n\nfrom distilabel.models import MistralLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass Node(BaseModel):\n id: int\n label: str\n color: str\n\n\nclass Edge(BaseModel):\n source: int\n target: int\n label: str\n color: str = \"black\"\n\n\nclass KnowledgeGraph(BaseModel):\n nodes: List[Node] = Field(..., default_factory=list)\n edges: List[Edge] = Field(..., default_factory=list)\n\n\nwith Pipeline(\n name=\"Knowledge-Graphs\",\n description=(\n \"Generate knowledge graphs to answer questions, this type of dataset can be used to \"\n \"steer a model to answer questions with a knowledge graph.\"\n ),\n) as pipeline:\n sample_questions = [\n \"Teach me about quantum mechanics\",\n \"Who is who in The Simpsons family?\",\n \"Tell me about the evolution of programming languages\",\n ]\n\n load_dataset = LoadDataFromDicts(\n name=\"load_instructions\",\n data=[\n {\n \"system_prompt\": \"You are a knowledge graph expert generator. Help me understand by describing everything as a detailed knowledge graph.\",\n \"instruction\": f\"{question}\",\n }\n for question in sample_questions\n ],\n )\n\n text_generation = TextGeneration(\n name=\"knowledge_graph_generation\",\n llm=MistralLLM(\n model=\"open-mixtral-8x22b\", structured_output={\"schema\": KnowledgeGraph}\n ),\n input_batch_size=8,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n load_dataset >> text_generation\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n text_generation.name: {\n \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 2048}}\n }\n },\n use_cache=False,\n )\n\n distiset.push_to_hub(\"distilabel-internal-testing/knowledge_graphs\")\n Visualizing the graphs Want to see how to visualize the graphs? You can test it using the following script. Generate some samples on your own and take a look: Note This example uses graphviz to render the graph, you can install with pip in the following way: pip install graphviz\n python examples/draw_kg.py 2 # You can pass 0,1,2 to visualize each of the samples.\n "},{"location":"sections/pipeline_samples/examples/text_generation_with_image/","title":"Text generation with images in distilabel ","text":"Answer questions about images using distilabel . Image-text-to-text models take in an image and text prompt and output text. In this example we will use an LLM InferenceEndpointsLLM with meta-llama/Llama-3.2-11B-Vision-Instruct to ask a question about an image, and OpenAILLM with gpt-4o-mini . We will ask a simple question to showcase how the TextGenerationWithImage task can be used in a pipeline. 
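Besides pointing to a URL, the image column can also carry a base64 string (together with the corresponding image_type value); a minimal sketch of preparing such a row with only the standard library (the local file path here is hypothetical) could be: import base64\nfrom pathlib import Path\n\ndef image_to_base64(path: str) -> str:\n # Encode a local image so it can be placed in the image column\n return base64.b64encode(Path(path).read_bytes()).decode(\"utf-8\")\n\nrow = {\n \"instruction\": \"What is in this image?\",\n \"image\": image_to_base64(\"nature_boardwalk.jpg\"), # hypothetical local file\n}\n 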
Inference Endpoints - meta-llama/Llama-3.2-11B-Vision-InstructOpenAI - gpt-4o-mini from distilabel.models.llms import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks.text_generation_with_image import TextGenerationWithImage\nfrom distilabel.steps import LoadDataFromDicts\n\n\nwith Pipeline(name=\"vision_generation_pipeline\") as pipeline:\n loader = LoadDataFromDicts(\n data=[\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n }\n ],\n )\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n )\n\n vision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=llm,\n image_type=\"url\" # (1)\n )\n\n loader >> vision\n - The image_type can be a url pointing to the image, the base64 string representation, or a PIL image, take a look at the
TextGenerationWithImage for more information. Image: Question: What\u2019s in this image? Response: This image depicts a wooden boardwalk weaving its way through a lush meadow, flanked by vibrant green grass that stretches towards the horizon under a calm and inviting sky. The boardwalk runs straight ahead, away from the viewer, forming a clear pathway through the tall, lush green grass, crops or other plant types or an assortment of small trees and shrubs. This meadow is dotted with trees and shrubs, appearing to be healthy and green. The sky above is a beautiful blue with white clouds scattered throughout, adding a sense of tranquility to the scene. While this image appears to be of a natural landscape, because grass is... from distilabel.models.llms import OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks.text_generation_with_image import TextGenerationWithImage\nfrom distilabel.steps import LoadDataFromDicts\n\n\nwith Pipeline(name=\"vision_generation_pipeline\") as pipeline:\n loader = LoadDataFromDicts(\n data=[\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n }\n ],\n )\n\n llm = OpenAILLM(\n model=\"gpt-4o-mini\",\n )\n\n vision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=llm,\n image_type=\"url\" # (1)\n )\n\n loader >> vision\n - The image_type can be a url pointing to the image, the base64 string representation, or a PIL image, take a look at the
VisionGeneration for more information. Image: Question: What\u2019s in this image? Response: The image depicts a scenic landscape featuring a wooden walkway or path that runs through a lush green marsh or field. The area is surrounded by tall grass and various shrubs, with trees likely visible in the background. The sky is blue with some wispy clouds, suggesting a beautiful day. Overall, it presents a peaceful natural setting, ideal for a stroll or nature observation. The full pipeline can be run at the following example: Run the full pipeline python examples/text_generation_with_image.py\n text_generation_with_image.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom distilabel.models.llms import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks.text_generation_with_image import TextGenerationWithImage\n\nwith Pipeline(name=\"vision_generation_pipeline\") as pipeline:\n loader = LoadDataFromDicts(\n data=[\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n }\n ],\n )\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n )\n\n vision = TextGenerationWithImage(name=\"vision_gen\", llm=llm, image_type=\"url\")\n\n loader >> vision\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n distiset.push_to_hub(\"plaguss/test-vision-generation-Llama-3.2-11B-Vision-Instruct\")\n A sample dataset can be seen at plaguss/test-vision-generation-Llama-3.2-11B-Vision-Instruct. "},{"location":"sections/pipeline_samples/papers/apigen/","title":"Create Function-Calling datasets with APIGen","text":"This example will introduce APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets, a data generation pipeline designed to synthesize verifiable high-quality datasets for function-calling applications. "},{"location":"sections/pipeline_samples/papers/apigen/#replication","title":"Replication","text":"The following figure showcases the APIGen framework: Now, let's walk through the key steps illustrated in the figure: -
DataSampler : With the help of this step and the original Salesforce/xlam-function-calling-60k we are getting the Seed QA Data Sampler for the prompt template. -
APIGenGenerator : This step does the job of the Query-Answer Generator, including the format checker from Stage 1: Format Checker thanks to the structured output generation. -
APIGenExecutionChecker : This step is in charge of the Stage 2: Execution Checker. -
APIGenSemanticChecker : Step in charge of running Stage 3: Semantic Checker, can use the same or a different LLM, we are using the same as in APIGenGenerator step. The current implementation hasn't utilized the Diverse Prompt Library. To incorporate it, one could either adjust the prompt template within the APIGenGenerator or develop a new sampler specifically for this purpose. As for the API Sampler, while no specific data is shared here, we've created illustrative examples to demonstrate the pipeline's functionality. These examples represent a mix of data that could be used to replicate the sampler's output. "},{"location":"sections/pipeline_samples/papers/apigen/#data-preparation","title":"Data preparation","text":"The original paper tells about the data they used and give some hints, but nothing was shared. In this example, we will write a bunch of examples by hand to showcase how this pipeline can be built. Assume we have the following function names, and corresponding descriptions of their behaviour: data = [\n {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n },\n {\n \"func_name\": \"permutation_count\",\n \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n },\n {\n \"func_name\": \"getdivision\",\n \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n },\n {\n \"func_name\": \"binary_addition\",\n \"func_desc\": \"Adds two binary numbers and returns the result as a binary string.\",\n },\n {\n \"func_name\": \"swapi_planet_resource\",\n \"func_desc\": \"get a specific planets resource\",\n },\n {\n \"func_name\": \"disney_character\",\n \"func_desc\": \"Find a specific character using this endpoint\",\n }\n]\n The original paper refers to both python functions and APIs, but we will make use of python functions exclusively for simplicity. In order to execute and check this functions/APIs, we need access to the code, which we have moved to a Python file: lib_apigen.py. All this functions are executable, but we also need access to their tool representation. For this, we will make use of transformers' get_json_schema function1. We have all the machinery prepared in our libpath, except from the tool definition. With the help of our helper function load_module_from_path we will load this python module, collect all the tools, and add them to each row in our data variable. from distilabel.steps.tasks.apigen.utils import load_module_from_path\n\nlibpath_module = load_module_from_path(libpath)\ntools = getattr(libpath_module, \"get_tools\")() # call get_tools()\n\nfor row in data:\n #\u00a0The tools should have a mix where both the correct and irrelevant tools are present.\n row.update({\"tools\": [tools[row[\"func_name\"]]]})\n Now we have all the necessary data for our prompt. Additionally, we will make use of the original dataset as few-shot examples to enhance the model: ds_og = (\n load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n .shuffle(seed=42)\n .select(range(500))\n .to_list()\n)\n We have just loaded a subset and transformed it to a list of dictionaries, as we will use it in the DataSampler GeneratorStep , grabbing random examples from the original dataset. 
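To give an idea of what such a tool representation looks like, here is a minimal sketch of using transformers' get_json_schema on a typed, documented function (the body of final_velocity below is illustrative and not the one from lib_apigen.py): from transformers.utils import get_json_schema\n\ndef final_velocity(initial_velocity: float, acceleration: float, time: float) -> float:\n \"\"\"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\n\n Args:\n initial_velocity: The initial velocity of the object.\n acceleration: The acceleration of the object.\n time: The time elapsed.\n \"\"\"\n return initial_velocity + acceleration * time\n\n# Returns a tool dict similar to the one shown in the example row of the final section\nprint(get_json_schema(final_velocity))\n 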
"},{"location":"sections/pipeline_samples/papers/apigen/#building-the-pipeline","title":"Building the Pipeline","text":"Now that we've walked through each component, it's time to see how it all comes together, here's the Pipeline code: with Pipeline(name=\"apigen-example\") as pipeline:\n loader_seeds = LoadDataFromDicts(data=data) # (1)\n\n sampler = DataSampler( # (2)\n data=ds_og,\n size=2,\n samples=len(data),\n batch_size=8,\n )\n\n prep_examples = PrepareExamples() # This step will add the 'examples' column\n\n combine_steps = CombineOutputs() # (3)\n\n model_id = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n llm=InferenceEndpointsLLM( # (4)\n model_id=model_id,\n tokenizer_id=model_id,\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 2048,\n },\n )\n apigen = APIGenGenerator( # (5)\n llm=llm,\n use_default_structured_output=True,\n )\n\n execution_checker = APIGenExecutionChecker(libpath=str(libpath)) # (6)\n semantic_checker = APIGenSemanticChecker(llm=llm) # (7)\n\n sampler >> prep_examples\n (\n [loader_seeds, prep_examples] \n >> combine_steps \n >> apigen\n >> execution_checker\n >> semantic_checker\n )\n -
Load the data seeds we are going to use to generate our function calling dataset. -
The DataSampler together with PrepareExamples will be used to help us create the few-shot examples from the original dataset to be fed into our prompt. -
Combine both columns to obtain a single stream of data. -
Will reuse the same LLM for the generation and the semantic checks. -
Creates the query and answers that will be used together with the tools to fine-tune a new model. Will generate the structured outputs to ensure we have valid JSON formatted answers. -
Adds columns keep_row_after_execution_check and execution_result . -
Adds columns keep_row_after_semantic_check and thought . "},{"location":"sections/pipeline_samples/papers/apigen/#script-and-final-dataset","title":"Script and final dataset","text":"To see all the pieces in place, take a look at the full pipeline, as well as an example row that would be generated from this pipeline. Run python examples/pipeline_apigen.py\n pipeline_apigen.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom pathlib import Path\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs, DataSampler, LoadDataFromDicts\nfrom distilabel.steps.tasks import (\n APIGenExecutionChecker,\n APIGenGenerator,\n APIGenSemanticChecker,\n)\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples, load_module_from_path\n\nlibpath = Path(__file__).parent / \"lib_apigen.py\"\n\ndata = [\n {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n },\n {\n \"func_name\": \"permutation_count\",\n \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n },\n {\n \"func_name\": \"getdivision\",\n \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n },\n {\n \"func_name\": \"binary_addition\",\n \"func_desc\": \"Adds two binary numbers and returns the result as a binary string.\",\n },\n {\n \"func_name\": \"swapi_planet_resource\",\n \"func_desc\": \"get a specific planets resource\",\n },\n {\n \"func_name\": \"disney_character\",\n \"func_desc\": \"Find a specific character using this endpoint\",\n },\n]\n\nlibpath_module = load_module_from_path(libpath)\ntools = libpath_module.get_tools() # call get_tools()\n\n# TODO: Add in the tools between 0 and 2 extra tools to make the task more challenging.\nfor row in data:\n # The tools should have a mix where both the correct and irrelevant tools are present.\n row.update({\"tools\": [tools[row[\"func_name\"]]]})\n\n\nds_og = (\n load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n .shuffle(seed=42)\n .select(range(500))\n .to_list()\n)\n\n\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n loader_seeds = LoadDataFromDicts(data=data)\n sampler = DataSampler(\n data=ds_og,\n size=2,\n samples=len(data),\n batch_size=8,\n )\n\n prep_examples = PrepareExamples()\n\n model_id = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n llm = InferenceEndpointsLLM(\n model_id=model_id,\n tokenizer_id=model_id,\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 2048,\n },\n )\n apigen = APIGenGenerator(\n llm=llm,\n use_default_structured_output=True,\n )\n combine_steps = CombineOutputs()\n\n execution_checker = APIGenExecutionChecker(libpath=str(libpath))\n semantic_checker = APIGenSemanticChecker(llm=llm)\n\n sampler >> prep_examples\n (\n [loader_seeds, prep_examples]\n >> 
combine_steps\n >> apigen\n >> execution_checker\n >> semantic_checker\n )\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run()\n print(distiset[\"default\"][\"train\"][0])\n Example row: {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n \"tools\": [\n {\n \"function\": {\n \"description\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n \"name\": \"final_velocity\",\n \"parameters\": {\n \"properties\": {\n \"acceleration\": {\n \"description\": \"The acceleration of the object.\",\n \"type\": \"number\"\n },\n \"initial_velocity\": {\n \"description\": \"The initial velocity of the object.\",\n \"type\": \"number\"\n },\n \"time\": {\n \"description\": \"The time elapsed.\",\n \"type\": \"number\"\n }\n },\n \"required\": [\n \"initial_velocity\",\n \"acceleration\",\n \"time\"\n ],\n \"type\": \"object\"\n }\n },\n \"type\": \"function\"\n }\n ],\n \"examples\": \"## Query:\\nRetrieve the first 15 comments for post ID '12345' from the Tokapi mobile API.\\n## Answers:\\n[{\\\"name\\\": \\\"v1_post_post_id_comments\\\", \\\"arguments\\\": {\\\"post_id\\\": \\\"12345\\\", \\\"count\\\": 15}}]\\n\\n## Query:\\nRetrieve the detailed recipe for the cake with ID 'cake101'.\\n## Answers:\\n[{\\\"name\\\": \\\"detailed_cake_recipe_by_id\\\", \\\"arguments\\\": {\\\"is_id\\\": \\\"cake101\\\"}}]\\n\\n## Query:\\nWhat are the frequently asked questions and their answers for Coca-Cola Company? Also, what are the suggested tickers based on Coca-Cola Company?\\n## Answers:\\n[{\\\"name\\\": \\\"symbols_faq\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}, {\\\"name\\\": \\\"symbols_suggested\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}]\",\n \"query\": \"What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\",\n \"answers\": \"[{\\\"arguments\\\": {\\\"acceleration\\\": \\\"9.8\\\", \\\"initial_velocity\\\": \\\"0\\\", \\\"time\\\": \\\"10\\\"}, \\\"name\\\": \\\"final_velocity\\\"}]\",\n \"distilabel_metadata\": {\n \"raw_input_a_p_i_gen_generator_0\": [\n {\n \"content\": \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\",\n \"role\": \"system\"\n },\n {\n \"content\": \"Here are examples of queries and the corresponding answers for similar functions:\\n## Query:\\nRetrieve the first 15 comments for post ID '12345' from the Tokapi mobile API.\\n## Answers:\\n[{\\\"name\\\": \\\"v1_post_post_id_comments\\\", \\\"arguments\\\": {\\\"post_id\\\": \\\"12345\\\", \\\"count\\\": 15}}]\\n\\n## Query:\\nRetrieve the detailed recipe for the cake with ID 'cake101'.\\n## Answers:\\n[{\\\"name\\\": \\\"detailed_cake_recipe_by_id\\\", \\\"arguments\\\": {\\\"is_id\\\": \\\"cake101\\\"}}]\\n\\n## Query:\\nWhat are the frequently asked questions and their answers for Coca-Cola Company? Also, what are the suggested tickers based on Coca-Cola Company?\\n## Answers:\\n[{\\\"name\\\": \\\"symbols_faq\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}, {\\\"name\\\": \\\"symbols_suggested\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\n\\nBased on these examples, generate 1 diverse query and answer pairs for the function `final_velocity`.\\nThe detailed function description is the following:\\nCalculates the final velocity of an object given its initial velocity, acceleration, and time.\\n\\nThese are the available tools to help you:\\n[{'type': 'function', 'function': {'name': 'final_velocity', 'description': 'Calculates the final velocity of an object given its initial velocity, acceleration, and time.', 'parameters': {'type': 'object', 'properties': {'initial_velocity': {'type': 'number', 'description': 'The initial velocity of the object.'}, 'acceleration': {'type': 'number', 'description': 'The acceleration of the object.'}, 'time': {'type': 'number', 'description': 'The time elapsed.'}}, 'required': ['initial_velocity', 'acceleration', 'time']}}}]\\n\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n```json\\n[\\n {\\n \\\"query\\\": \\\"The generated query.\\\",\\n \\\"answers\\\": [\\n {\\n \\\"name\\\": \\\"api_name\\\",\\n \\\"arguments\\\": {\\n \\\"arg_name\\\": \\\"value\\\"\\n ... (more arguments as required)\\n }\\n },\\n ... (more API calls as required)\\n ]\\n }\\n]\\n```\\n\\nNow please generate 1 diverse query and answer pairs following the above format.\",\n \"role\": \"user\"\n }\n ],\n \"raw_input_a_p_i_gen_semantic_checker_0\": [\n {\n \"content\": \"As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\\u2019s intentions.\\n4. 
The execution results are irrelevant and do not match the function\\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\",\n \"role\": \"system\"\n },\n {\n \"content\": \"Given Information:\\n- All Available Functions:\\nCalculates the final velocity of an object given its initial velocity, acceleration, and time.\\n- User Query: What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\\n- Generated Function Calls: [{\\\"arguments\\\": {\\\"acceleration\\\": \\\"9.8\\\", \\\"initial_velocity\\\": \\\"0\\\", \\\"time\\\": \\\"10\\\"}, \\\"name\\\": \\\"final_velocity\\\"}]\\n- Execution Results: ['9.8']\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query's intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n \\\"thought\\\": \\\"Concisely describe your reasoning here\\\",\\n \\\"passes\\\": \\\"yes\\\" or \\\"no\\\"\\n}\\n```\\n\",\n \"role\": \"user\"\n }\n ],\n \"raw_output_a_p_i_gen_generator_0\": \"{\\\"pairs\\\": [\\n {\\n \\\"answers\\\": [\\n {\\n \\\"arguments\\\": {\\n \\\"acceleration\\\": \\\"9.8\\\",\\n \\\"initial_velocity\\\": \\\"0\\\",\\n \\\"time\\\": \\\"10\\\"\\n },\\n \\\"name\\\": \\\"final_velocity\\\"\\n }\\n ],\\n \\\"query\\\": \\\"What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\\\"\\n }\\n]}\",\n \"raw_output_a_p_i_gen_semantic_checker_0\": \"{\\n \\\"thought\\\": \\\"\\\",\\n \\\"passes\\\": \\\"yes\\\"\\n}\"\n },\n \"model_name\": \"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n \"keep_row_after_execution_check\": true,\n \"execution_result\": [\n \"9.8\"\n ],\n \"thought\": \"\",\n \"keep_row_after_semantic_check\": true\n}\n -
Read this nice blog post for more information on tools and the reasoning behind get_json_schema : Tool Use, Unified.\u00a0\u21a9 "},{"location":"sections/pipeline_samples/papers/clair/","title":"Contrastive Learning From AI Revisions (CLAIR)","text":"\"Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment\" introduces both Contrastive Learning from AI Revisions (CLAIR), a data-creation method which leads to more contrastive preference pairs, and Anchored Preference Optimization (APO), a controllable and more stable alignment objective. While APO can be found in TRL, we have implemented a task for CLAIR in distilabel . CLAIR is a method for creating preference pairs which minimally revises one output to express a preference, resulting in a more precise learning signal as opposed to conventional methods which use a judge to select a preferred response. The authors of the original paper shared a collection of datasets from CLAIR and APO, where ContextualAI/ultrafeedback_clair_32k corresponds to the CLAIR implementation. "},{"location":"sections/pipeline_samples/papers/clair/#replication","title":"Replication","text":"Note The section is named Replication but in this case we are showing how to use the CLAIR task to create revisions for your generations using distilabel . To showcase CLAIR we will be using the CLAIR task implemented in distilabel and we are reusing a small sample of the already generated dataset by ContextualAI ContextualAI/ultrafeedback_clair_32k for testing. "},{"location":"sections/pipeline_samples/papers/clair/#installation","title":"Installation","text":"To reproduce the code below, one will need to install distilabel as follows: pip install \"distilabel>=1.4.0\"\n Depending on the LLM provider you want to use, the requirements may vary, so take a look at the dependencies in that case. For this example we are using the free inference endpoints from Hugging Face, but that won't be enough for a bigger dataset. "},{"location":"sections/pipeline_samples/papers/clair/#building-blocks","title":"Building blocks","text":"In this case where we already have instructions and their generations, we will just need to load the data and the corresponding CLAIR task for the revisions: CLAIR to generate the revisions. "},{"location":"sections/pipeline_samples/papers/clair/#code","title":"Code","text":"Let's see the full pipeline applied to ContextualAI/ultrafeedback_clair_32k in distilabel : from typing import Any, Dict\n\nfrom datasets import load_dataset\n\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\n\ndef transform_ultrafeedback(example: Dict[str, Any]) -> Dict[str, Any]:\n return {\n \"task\": example[\"prompt\"],\n \"student_solution\": example[\"rejected\"][1][\"content\"],\n }\n\ndataset = (\n load_dataset(\"ContextualAI/ultrafeedback_clair_32k\", split=\"train\")\n .select(range(10)) #\u00a0We collect just 10 examples\n .map(transform_ultrafeedback) # Apply the transformation to get just the text\n)\n\nwith Pipeline(name=\"CLAIR UltraFeedback sample\") as pipeline:\n clair = CLAIR( # (1)\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096\n }\n )\n )\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(dataset=dataset) # (2)\n distiset.push_to_hub(repo_id=\"username/clair-test\", include_script=True) # (3)\n -
This Pipeline uses only CLAIR because we already have the generations, but one could also add a first task to create generations from instructions, and then apply the revisions with CLAIR. -
Include the dataset directly in the run method for simplicity. -
Push the distiset to the hub with the script for reproducibility. An example dataset can be found at: distilabel-internal-testing/clair-test. "},{"location":"sections/pipeline_samples/papers/deepseek_prover/","title":"DeepSeek Prover","text":"\"DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data\" presents an approach to generate mathematical proofs for theorems generated from informal math problems. This approach shows promising results to advance the capabilities of models towards theorem proving using synthetic data. As of now, neither the dataset nor the model trained on top of it have been released; let's see how the approach works so we can reproduce the pipeline using distilabel . The following figure depicts the approach taken to generate the dataset: The authors propose a method for generating Lean 4 proof data from informal mathematical problems. Their approach translates high-school and undergraduate-level mathematical competition problems into formal statements. Here we show how to deal with steps 1 and 2, but the authors ensure the theorems are checked using the lean4 program on the generated proofs, and iterate for a series of steps, fine-tuning a model on the synthetic data (DeepSeek prover 7B), regenerating the dataset, and continuing the process until no further improvement is found. "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#replication","title":"Replication","text":"Note The section is named Replication but we will show how we can use distilabel to create the different steps outlined in the DeepSeek-Prover approach. We intentionally leave some steps out of the pipeline, but this can easily be extended. We will define the components needed to generate a dataset like the one depicted in the previous figure (we won't call lean4 or do the fine-tuning; this last step can be done outside of distilabel ). The different blocks will have all the docstrings as we would have in the internal steps to showcase how they are done, but they can be omitted for brevity. "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#installation","title":"Installation","text":"To reproduce the code below, we need to install distilabel as follows: pip install \"distilabel[hf-inference-endpoints]\"\n We have decided to use InferenceEndpointsLLM , but any other provider with a strong model could work. "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#building-blocks","title":"Building blocks","text":"There are three components we need to define for this pipeline, one for each component in the paper: a task to formalize the original statements, another one to assess the relevance of the theorems, and a final one to generate proofs for the theorems. Note We will use the same LLM for all the tasks, so we will define it once and reuse it for the different tasks: llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n)\n "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproverautoformalization","title":"DeepSeekProverAutoFormalization","text":"This Task corresponds to the first step in the figure. Given an informal statement, it will formalize it for us in Lean 4 language, meaning it will translate an informal statement that could be gathered from the internet into the structured lean4 language. 
DeepSeekProverAutoFormalization _PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX = r\"```lean4(.*?)```\"\n\ntemplate_deepseek_prover_auto_formalization = \"\"\"\\\nMathematical Problem in Natural Language:\n{{ informal_statement }}\n{%- if few_shot %}\n\nPlease use the following examples to guide you with the answer:\n{%- for example in examples %}\n- {{ example }}\n{%- endfor %}\n{% endif -%}\"\"\"\n\n\nclass DeepSeekProverAutoFormalization(Task):\n examples: Optional[List[str]] = None\n system_prompt: str = \"Translate the problem to Lean 4 (only the core declaration):\\n```lean4\\nformal statement goes here\\n```\"\n _template: Union[Template, None] = PrivateAttr(...)\n _few_shot: bool = PrivateAttr(default=False)\n\n def load(self) -> None:\n super().load()\n self._template = Template(template_deepseek_prover_auto_formalization)\n\n @property\n def inputs(self) -> List[str]:\n return [\"informal_statement\"]\n\n @property\n def outputs(self):\n return [\"formal_statement\", \"model_name\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n return [\n {\n \"role\": \"system\",\n \"content\": self.system_prompt,\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n informal_statement=input[self.inputs[0]],\n few_shot=bool(self.examples),\n examples=self.examples,\n ),\n },\n ]\n\n @override\n def format_output( # type: ignore\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]: # type: ignore\n match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n if match:\n match = match.group(1).strip()\n return {\"formal_statement\": match}\n Following the paper, they found that the model yields better results if it uses examples in a few shot setting, so this class allows to take some examples to help in generating the formulation. Let's see an example of how we can instantiate it: from textwrap import dedent\n\nexamples = [\n dedent(\"\"\"\n ## Statement in natural language:\n For real numbers k and x:\n If x is equal to (13 - \u221a131) / 4, and\n If the equation 2x\u00b2 - 13x + k = 0 is satisfied,\n Then k must be equal to 19/4.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n dedent(\"\"\"\n ## Statement in natural language:\n The greatest common divisor (GCD) of 20 factorial (20!) and 200,000 is equal to 40,000.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n dedent(\"\"\"\n ## Statement in natural language:\n Given two integers x and y:\n If y is positive (greater than 0),\n And y is less than x,\n And the equation x + y + xy = 80 is true,\n Then x must be equal to 26.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n]\n\nauto_formalization = DeepSeekProverAutoFormalization(\n name=\"auto_formalization\",\n input_batch_size=8,\n llm=llm,\n examples=examples\n)\n "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproverscorer","title":"DeepSeekProverScorer","text":"The next Task corresponds to the second step, the model scoring and assessment. It uses an LLM as judge to evaluate the relevance of the theorem, and assigns a score so it can be filtered afterwards. 
DeepSeekProverScorer template_deepseek_prover_scorer = \"\"\"\\\nTo evaluate whether a formal Lean4 statement will be of interest to the community, consider the following criteria:\n\n1. Relevance to Current Research: Does the statement address a problem or concept that is actively being researched in mathematics or related fields? Higher relevance scores indicate greater potential interest.\n2. Complexity and Depth: Is the statement complex enough to challenge existing theories and methodologies, yet deep enough to provide significant insights or advancements? Complexity and depth showcase Lean4's capabilities and attract interest.\n3. Interdisciplinary Potential: Does the statement offer opportunities for interdisciplinary research, connecting mathematics with other fields such as computer science, physics, or biology? Interdisciplinary projects often garner wide interest.\n4. Community Needs and Gaps: Does the statement fill an identified need or gap within the Lean4 community or the broader mathematical community? Addressing these needs directly correlates with interest.\n5. Innovativeness: How innovative is the statement? Does it propose new methods, concepts, or applications? Innovation drives interest and engagement.\n\nCustomize your evaluation for each problem accordingly, assessing it as 'excellent', 'good', 'above average', 'fair' or 'poor'.\n\nYou should respond in the following format for each statement:\n\n'''\nNatural language: (Detailed explanation of the informal statement, including any relevant background information, assumptions, and definitions.)\nAnalysis: (Provide a brief justification for each score, highlighting why the statement scored as it did across the criteria.)\nAssessment: (Based on the criteria, rate the statement as 'excellent', 'good', 'above average', 'fair' or 'poor'. JUST the Assessment.)\n'''\"\"\"\n\nclass DeepSeekProverScorer(Task):\n _template: Union[Template, None] = PrivateAttr(...)\n\n def load(self) -> None:\n super().load()\n self._template = Template(template_deepseek_prover_scorer)\n\n @property\n def inputs(self) -> List[str]:\n return [\"informal_statement\", \"formal_statement\"]\n\n @property\n def outputs(self):\n return [\"natural_language\", \"analysis\", \"assessment\", \"model_name\"]\n\n def format_input(self, input: str) -> ChatType:\n return [\n {\n \"role\": \"system\",\n \"content\": self._template.render(),\n },\n {\n \"role\": \"user\",\n \"content\": f\"## Informal statement:\\n{input[self.inputs[0]]}\\n\\n ## Formal statement:\\n{input[self.inputs[1]]}\",\n },\n ]\n\n @override\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]:\n try:\n result = output.split(\"Natural language:\")[1].strip()\n natural_language, analysis = result.split(\"Analysis:\")\n analysis, assessment = analysis.split(\"Assessment:\")\n natural_language = natural_language.strip()\n analysis = analysis.strip()\n assessment = assessment.strip()\n except Exception:\n natural_language = analysis = assessment = None\n\n return {\n \"natural_language\": natural_language,\n \"analysis\": analysis,\n \"assessment\": assessment\n }\n "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproversolver","title":"DeepSeekProverSolver","text":"The last task is in charge of generating a proof for the theorems generated in the previous steps. 
DeepSeekProverSolver class DeepSeekProverSolver(Task):\n system_prompt: str = (\n \"You are an expert in proving mathematical theorems formalized in lean4 language. \"\n \"Your answers consist just in the proof to the theorem given, and nothing else.\"\n )\n\n @property\n def inputs(self) -> List[str]:\n return [\"formal_statement\"]\n\n @property\n def outputs(self):\n return [\"proof\"]\n\n def format_input(self, input: str) -> ChatType:\n prompt = dedent(\"\"\"\n Give me a proof for the following theorem:\n ```lean4\n {theorem}\n ```\"\"\"\n )\n return [\n {\n \"role\": \"system\",\n \"content\": self.system_prompt,\n },\n {\n \"role\": \"user\",\n \"content\": prompt.format(theorem=input[\"formal_statement\"]),\n },\n ]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]:\n import re\n match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n if match:\n match = match.group(1).strip()\n return {\"proof\": match}\n Additionally, the original pipeline defined in the paper includes a step to check the final proofs using the lean 4 language that we have omitted for simplicity. The fine tuning can be done completely offline, and come back to the pipeline after each iteration/training run. All the docstrings have been removed from the code blocks, but can be seen in the full pipeline. "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#code","title":"Code","text":"Lets's put the building blocks together to create the final pipeline with distilabel . For this example we have generated a sample dataset plaguss/informal-mathematical-statements-tiny of informal mathematical statements starting from casey-martin/multilingual-mathematical-autoformalization, but as the paper mentions, we can create formal statements and it's corresponding proofs starting from informal ones: Click to see the full pipeline deepseek_prover.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom pathlib import Path\nfrom textwrap import dedent\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom jinja2 import Template\nfrom pydantic import PrivateAttr\nfrom typing_extensions import override\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks.base import Task\nfrom distilabel.steps.tasks.typing import ChatType\n\n_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX = r\"```lean4(.*?)```\"\n\n\ntemplate_deepseek_prover_auto_formalization = \"\"\"\\\nMathematical Problem in Natural Language:\n{{ informal_statement }}\n{%- if few_shot %}\n\nPlease use the following examples to guide you with the answer:\n{%- for example in examples %}\n- {{ example }}\n{%- endfor %}\n{% endif -%}\"\"\"\n\n\nclass DeepSeekProverAutoFormalization(Task):\n \"\"\"Task to translate a mathematical problem from natural language to Lean 4.\n\n Note:\n A related dataset (MMA from the paper) 
can be found in Hugging Face:\n [casey-martin/multilingual-mathematical-autoformalization](https://huggingface.co/datasets/casey-martin/multilingual-mathematical-autoformalization).\n\n Input columns:\n - informal_statement (`str`): The statement to be formalized using Lean 4.\n\n Output columns:\n - formal_statement (`str`): The formalized statement using Lean 4, to be analysed.\n\n Categories:\n - generation\n\n References:\n - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n - [`Lean 4`](https://github.com/leanprover/lean4).\n\n Examples:\n\n Formalize a mathematical problem from natural language to Lean 4:\n\n ```python\n from distilabel.steps.tasks import DeepSeekProverAutoFormalization\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n prover_autoformal = DeepSeekProverAutoFormalization(\n llm=InferenceEndpointsLLM(\n model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n ),\n )\n\n prover_autoformal.load()\n\n result = next(\n prover_autoformal.process(\n [\n {\"informal_statement\": \"If a polynomial g is monic, then the root of g is integral over the ring R.\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'informal_statement': 'If a polynomial g is monic, then the root of g is integral over the ring R.',\n # 'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n # 'distilabel_metadata': {\n # 'raw_output_deep_seek_prover_auto_formalization_0': '```lean4\\ntheorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\\n```'\n # },\n # 'model_name': 'deepseek-prover'\n # }\n # ]\n ```\n\n Use a few-shot setting to formalize a mathematical problem from natural language to Lean 4:\n\n ```python\n from distilabel.steps.tasks import DeepSeekProverAutoFormalization\n from distilabel.models import InferenceEndpointsLLM\n\n # You can gain inspiration from the following examples to create your own few-shot examples:\n # https://github.com/yangky11/miniF2F-lean4/blob/main/MiniF2F/Valid.lean\n # Consider this as a placeholder for your actual LLM.\n prover_autoformal = DeepSeekProverAutoFormalization(\n llm=InferenceEndpointsLLM(\n model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n ),\n examples=[\n \"theorem amc12a_2019_p21 (z : \u2102) (h\u2080 : z = (1 + Complex.I) / Real.sqrt 2) :\\n\\n((\u2211 k : \u2124 in Finset.Icc 1 12, z ^ k ^ 2) * (\u2211 k : \u2124 in Finset.Icc 1 12, 1 / z ^ k ^ 2)) = 36 := by\\n\\nsorry\",\n \"theorem amc12a_2015_p10 (x y : \u2124) (h\u2080 : 0 < y) (h\u2081 : y < x) (h\u2082 : x + y + x * y = 80) : x = 26 := by\\n\\nsorry\"\n ]\n )\n\n prover_autoformal.load()\n\n result = next(\n prover_autoformal.process(\n [\n {\"informal_statement\": \"If a polynomial g is monic, then the root of g is integral over the ring R.\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'informal_statement': 'If a polynomial g is monic, then the root of g is integral over the ring R.',\n # 'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n # 'distilabel_metadata': {\n # 'raw_output_deep_seek_prover_auto_formalization_0': '```lean4\\ntheorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\\n```'\n # },\n # 'model_name': 'deepseek-prover'\n # }\n # ]\n ```\n \"\"\"\n\n examples: Optional[List[str]] = None\n system_prompt: str = \"Translate the problem to Lean 4 
(only the core declaration):\\n```lean4\\nformal statement goes here\\n```\"\n _template: Union[Template, None] = PrivateAttr(...)\n _few_shot: bool = PrivateAttr(default=False)\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n self._template = Template(template_deepseek_prover_auto_formalization)\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`.\"\"\"\n return [\"informal_statement\"]\n\n @property\n def outputs(self):\n \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n return [\"formal_statement\", \"model_name\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [\n {\n \"role\": \"system\",\n \"content\": self.system_prompt,\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n informal_statement=input[self.inputs[0]],\n few_shot=bool(self.examples),\n examples=self.examples,\n ),\n },\n ]\n\n @override\n def format_output( # type: ignore\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]: # type: ignore\n \"\"\"Extracts the formal statement from the Lean 4 output.\"\"\"\n match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n if match:\n match = match.group(1).strip()\n return {\"formal_statement\": match}\n\n\ntemplate_deepseek_prover_scorer = \"\"\"\\\nTo evaluate whether a formal Lean4 statement will be of interest to the community, consider the following criteria:\n\n1. Relevance to Current Research: Does the statement address a problem or concept that is actively being researched in mathematics or related fields? Higher relevance scores indicate greater potential interest.\n2. Complexity and Depth: Is the statement complex enough to challenge existing theories and methodologies, yet deep enough to provide significant insights or advancements? Complexity and depth showcase Lean4's capabilities and attract interest.\n3. Interdisciplinary Potential: Does the statement offer opportunities for interdisciplinary research, connecting mathematics with other fields such as computer science, physics, or biology? Interdisciplinary projects often garner wide interest.\n4. Community Needs and Gaps: Does the statement fill an identified need or gap within the Lean4 community or the broader mathematical community? Addressing these needs directly correlates with interest.\n5. Innovativeness: How innovative is the statement? Does it propose new methods, concepts, or applications? Innovation drives interest and engagement.\n\nCustomize your evaluation for each problem accordingly, assessing it as 'excellent', 'good', 'above average', 'fair' or 'poor'.\n\nYou should respond in the following format for each statement:\n\n'''\nNatural language: (Detailed explanation of the informal statement, including any relevant background information, assumptions, and definitions.)\nAnalysis: (Provide a brief justification for each score, highlighting why the statement scored as it did across the criteria.)\nAssessment: (Based on the criteria, rate the statement as 'excellent', 'good', 'above average', 'fair' or 'poor'. 
JUST the Assessment.)\n'''\"\"\"\n\n\nclass DeepSeekProverScorer(Task):\n \"\"\"Task to evaluate the quality of a formalized mathematical problem in Lean 4,\n inspired by the DeepSeek-Prover task for scoring.\n\n Note:\n A related dataset (MMA from the paper) can be found in Hugging Face:\n [casey-martin/multilingual-mathematical-autoformalization](https://huggingface.co/datasets/casey-martin/multilingual-mathematical-autoformalization).\n\n Input columns:\n - informal_statement (`str`): The statement to be formalized using Lean 4.\n - formal_statement (`str`): The formalized statement using Lean 4, to be analysed.\n\n Output columns:\n - natural_language (`str`): Explanation for the problem.\n - analysis (`str`): Analysis of the different points defined in the prompt.\n - assessment (`str`): Result of the assessment.\n\n Categories:\n - scorer\n - quality\n - response\n\n References:\n - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n - [`Lean 4`](https://github.com/leanprover/lean4).\n\n Examples:\n\n Analyse a formal statement in Lean 4:\n\n ```python\n from distilabel.steps.tasks import DeepSeekProverScorer\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n prover_scorer = DeepSeekProverAutoFormalization(\n llm=InferenceEndpointsLLM(\n model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n ),\n )\n\n prover_scorer.load()\n\n result = next(\n prover_scorer.process(\n [\n {\"formal_statement\": \"theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n # 'informal_statement': 'INFORMAL',\n # 'analysis': 'ANALYSIS',\n # 'assessment': 'ASSESSMENT',\n # 'distilabel_metadata': {\n # 'raw_output_deep_seek_prover_scorer_0': 'Natural language:\\nINFORMAL\\nAnalysis:\\nANALYSIS\\nAssessment:\\nASSESSMENT'\n # },\n # 'model_name': 'deepseek-prover-scorer'\n # }\n # ]\n ```\n \"\"\"\n\n _template: Union[Template, None] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n self._template = Template(template_deepseek_prover_scorer)\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`.\"\"\"\n return [\"informal_statement\", \"formal_statement\"]\n\n @property\n def outputs(self):\n \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n return [\"natural_language\", \"analysis\", \"assessment\", \"model_name\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. 
And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [\n {\n \"role\": \"system\",\n \"content\": self._template.render(),\n },\n {\n \"role\": \"user\",\n \"content\": f\"## Informal statement:\\n{input[self.inputs[0]]}\\n\\n ## Formal statement:\\n{input[self.inputs[1]]}\",\n },\n ]\n\n @override\n def format_output( # type: ignore\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]: # type: ignore\n \"\"\"Analyses the formal statement with Lean 4 output and generates an assessment\n and the corresponding informal assessment.\"\"\"\n\n try:\n result = output.split(\"Natural language:\")[1].strip()\n natural_language, analysis = result.split(\"Analysis:\")\n analysis, assessment = analysis.split(\"Assessment:\")\n natural_language = natural_language.strip()\n analysis = analysis.strip()\n assessment = assessment.strip()\n except Exception:\n natural_language = analysis = assessment = None\n\n return {\n \"natural_language\": natural_language,\n \"analysis\": analysis,\n \"assessment\": assessment,\n }\n\n\nclass DeepSeekProverSolver(Task):\n \"\"\"Task to generate a proof for a formal statement (theorem) in lean4.\n\n Input columns:\n - formal_statement (`str`): The formalized statement using Lean 4.\n\n Output columns:\n - proof (`str`): The proof for the formal statement theorem.\n\n Categories:\n - scorer\n - quality\n - response\n\n References:\n - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n \"\"\"\n\n system_prompt: str = (\n \"You are an expert in proving mathematical theorems formalized in lean4 language. \"\n \"Your answers consist just in the proof to the theorem given, and nothing else.\"\n )\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `formal_statement`.\"\"\"\n return [\"formal_statement\"]\n\n @property\n def outputs(self):\n \"\"\"The output for the task is the proof for the formal statement theorem.\"\"\"\n return [\"proof\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType`, with a system prompt to guide our model.\"\"\"\n prompt = dedent(\"\"\"\n Give me a proof for the following theorem:\n ```lean4\n {theorem}\n ```\"\"\")\n return [\n {\n \"role\": \"system\",\n \"content\": self.system_prompt,\n },\n {\n \"role\": \"user\",\n \"content\": prompt.format(theorem=input[\"formal_statement\"]),\n },\n ]\n\n def format_output( # type: ignore\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]: # type: ignore\n import re\n\n match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n if match:\n match = match.group(1).strip()\n return {\"proof\": match}\n\n\nexamples = [\n dedent(\"\"\"\n ## Statement in natural language:\n For real numbers k and x:\n If x is equal to (13 - \u221a131) / 4, and\n If the equation 2x\u00b2 - 13x + k = 0 is satisfied,\n Then k must be equal to 19/4.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n dedent(\"\"\"\n ## Statement in natural language:\n The greatest common divisor (GCD) of 20 factorial (20!) 
and 200,000 is equal to 40,000.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n dedent(\"\"\"\n ## Statement in natural language:\n Given two integers x and y:\n If y is positive (greater than 0),\n And y is less than x,\n And the equation x + y + xy = 80 is true,\n Then x must be equal to 26.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n]\n\n\nwith Pipeline(name=\"test_deepseek_prover\") as pipeline:\n data_loader = LoadDataFromHub(\n repo_id=\"plaguss/informal-mathematical-statements-tiny\",\n split=\"val\",\n batch_size=8,\n )\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n )\n auto_formalization = DeepSeekProverAutoFormalization(\n name=\"auto_formalization\", input_batch_size=8, llm=llm, examples=examples\n )\n prover_scorer = DeepSeekProverScorer(\n name=\"prover_scorer\",\n input_batch_size=8,\n llm=llm,\n )\n proof_generator = DeepSeekProverSolver(\n name=\"proof_generator\", input_batch_size=8, llm=llm\n )\n\n (data_loader >> auto_formalization >> prover_scorer >> proof_generator)\n\n\nif __name__ == \"__main__\":\n import argparse\n\n parser = argparse.ArgumentParser()\n parser.add_argument(\n \"-d\",\n \"--dry-run\",\n action=argparse.BooleanOptionalAction,\n help=\"Do a dry run for testing purposes.\",\n )\n args = parser.parse_args()\n\n pipeline_parameters = {\n data_loader.name: {\"split\": \"val\"},\n auto_formalization.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.6,\n \"top_p\": 0.9,\n \"max_new_tokens\": 512,\n }\n }\n },\n prover_scorer.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.6,\n \"top_p\": 0.9,\n \"max_new_tokens\": 512,\n }\n }\n },\n }\n\n ds_name = \"test_deepseek_prover\"\n\n if args.dry_run:\n distiset = pipeline.dry_run(batch_size=1, parameters=pipeline_parameters)\n distiset.save_to_disk(Path.home() / f\"Downloads/{ds_name}\")\n\n import pprint\n\n pprint.pprint(distiset[\"default\"][\"train\"][0])\n\n else:\n distiset = pipeline.run(parameters=pipeline_parameters)\n distiset.push_to_hub(ds_name, include_script=True)\n The script can be run run for a dry run or not, depending on the argument (the pipeline will run without dry run by default), and will be pushed to the hub with the name your_username/test_deepseek_prover : python deepseek_prover.py [-d | --dry-run | --no-dry-run]\n Final dataset: plaguss/test_deepseek_prover. "},{"location":"sections/pipeline_samples/papers/deita/","title":"DEITA","text":"DEITA (Data-Efficient Instruction Tuning for Alignment) studies an automatic data selection process by first quantifying the data quality based on complexity, quality and diversity. Second, select the best potential combination from an open-source dataset that would fit into the budget you allocate to tune your own LLM. In most setting we cannot allocate unlimited resources for instruction-tuning LLMs. Therefore, the DEITA authors investigated how to select qualitative data for instruction tuning based on the principle of fewer high-quality samples. Liu et al. tackle the issue of first defining good data and second identifying it to respect an initial budget to instruct-tune your LLM. The strategy utilizes LLMs to replace human effort in time-intensive data quality tasks on instruction-tuning datasets**. 
DEITA introduces a way to measure data quality across three critical dimensions: complexity, quality and diversity. You can see that we see again the dataset of instructions/responses and we kind of reproducing the second step when we learn how to optimize the responses according to an instruction by comparing several possibilities. "},{"location":"sections/pipeline_samples/papers/deita/#datasets-and-budget","title":"Datasets and budget","text":"We will dive deeper into the whole process. We will investigate each stage to efficiently select the final dataset used for supervised fine-tuning with a budget constraint. We will tackle technical challenges by explaining exactly how you would assess good data as presented in the paper. As a reminder, we're looking for a strategy to automatically select good data for the instruction-tuning step when you want to fine-tune an LLM to your own use case taking into account a resource constraint. This means that you cannot blindly train a model on any data you encounter on the internet. The DEITA authors assume that you have access to open-source datasets that fit your use case. This may not be the case entirely. But with open-source communities tackling many use cases, with projects such as BLOOM or AYA, it's likely that your use case will be tackled at some point. Furthermore, you could generate your own instruction/response pairs with methods such as self-generated instructions using distilabel. This tutorial assumes that we have a data pool with excessive samples for the project's cost constraint. In short, we aim to achieve adequate performance from fewer samples. The authors claim that the subsample size \"correlates proportionally with the computation consumed in instruction tuning\". Hence on a first approximation, reducing the sample size means reducing computation consumption and so the total development cost. Reproducing the paper notations, we will associate the budget m to a number of instruction/response pairs that you can set depending on your real budget. To match the experimental set-up, dataset X_sota is a meta-dataset combining major open-source datasets available to instruct-tune LLMs. This dataset is composed of ShareGPT (58k instruction/response pairs), UltraChat (105k instruction/response pairs) and WizardLM (143k instruction/response pairs). It sums to more than 300k instruction/response pairs. We aim to reduce the final subsample to 6k instruction/response pairs. "},{"location":"sections/pipeline_samples/papers/deita/#setup-the-notebook-and-packages","title":"Setup the notebook and packages","text":"Let's prepare our dependencies: pip install \"distilabel[openai,hf-transformers]>=1.0.0\"\npip install pynvml huggingface_hub argilla\n Import distilabel: from distilabel.models import TransformersLLM, OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import ConversationTemplate, DeitaFiltering, ExpandColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import ComplexityScorer, EvolInstruct, EvolQuality, GenerateEmbeddings, QualityScorer\n Define the distilabel Pipeline and load the dataset from the Hugging Face Hub. 
pipeline = Pipeline(name=\"DEITA\")\n\nload_data = LoadDataFromHub(\n name=\"load_data\", batch_size=100, output_mappings={\"prompt\": \"instruction\"}, pipeline=pipeline\n)\n "},{"location":"sections/pipeline_samples/papers/deita/#evol-instruct-generate-instructions-with-an-llm","title":"EVOL-INSTRUCT: Generate Instructions with an LLM","text":"Evol-Instruct automates the creation of complex instruction data for training large language models (LLMs) by progressively rewriting an initial set of instructions into more complex forms. This generated data is then used to fine-tune a model named WizardLM. Evaluations show that instructions from Evol-Instruct are superior to human-created ones, and WizardLM achieves performance close to or exceeding GPT3.5-turbo in many skills. In distilabel, we initialise each step of the data generation pipeline. Later, we'll connect them together. evol_instruction_complexity = EvolInstruct(\n name=\"evol_instruction_complexity\",\n llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n num_evolutions=5,\n store_evolutions=True,\n generate_answers=True,\n include_original_instruction=True,\n pipeline=pipeline,\n)\n\nevol_instruction_complexity.load()\n\n_evolved_instructions = next(evol_instruction_complexity.process(\n ([{\"instruction\": \"How many fish are there in a dozen fish?\"}]))\n)\n\nprint(*_evolved_instructions, sep=\"\\n\")\n Output: ( 1, 'How many fish are there in a dozen fish?')\n( 2, 'How many rainbow trout are there in a dozen rainbow trout?')\n( 3, 'What is the average weight in pounds of a dozen rainbow trout caught in a specific river in Alaska during the month of May?')\n "},{"location":"sections/pipeline_samples/papers/deita/#evol-complexity-evaluate-complexity-of-generated-instructions","title":"EVOL COMPLEXITY: Evaluate complexity of generated instructions","text":"The second step is the evaluation of complexity for an instruction in a given instruction-response pair. Like EVOL-INSTRUCT, this method uses LLMs instead of humans to automatically improve instructions, specifically through their complexity. From any instruction-response pair, \\((I, R)\\), we first generate new instructions following the In-Depth Evolving Response. We generate more complex instructions through prompting, as explained by authors, by adding some constraints or reasoning steps. Let\\'s take an example from GPT-4-LLM which aims to generate observations by GPT-4 to instruct-tune LLMs with supervised fine-tuning. And, we have the instruction \\(instruction_0\\): instruction_0 = \"Give three tips for staying healthy.\"\n To make it more complex, you can use, as the authors did, some prompt templates to add constraints or deepen the instruction. They provided some prompts in the paper appendix. For instance, this one was used to add constraints: PROMPT = \"\"\"I want you act as a Prompt Rewriter.\nYour objective is to rewrite a given prompt into a more complex version to\nmake those famous AI systems (e.g., ChatGPT and GPT4) a bit harder to handle.\nBut the rewritten prompt must be reasonable and must be understood and\nresponded by humans.\nYour rewriting cannot omit the non-text parts such as the table and code in\n#Given Prompt#:. 
Also, please do not omit the input in #Given Prompt#.\nYou SHOULD complicate the given prompt using the following method:\nPlease add one more constraints/requirements into #Given Prompt#\nYou should try your best not to make the #Rewritten Prompt# become verbose,\n#Rewritten Prompt# can only add 10 to 20 words into #Given Prompt#.\n\u2018#Given Prompt#\u2019, \u2018#Rewritten Prompt#\u2019, \u2018given prompt\u2019 and \u2018rewritten prompt\u2019\nare not allowed to appear in #Rewritten Prompt#\n#Given Prompt#:\n<Here is instruction>\n#Rewritten Prompt#:\n\"\"\"\n Prompting this to an LLM, you automatically get a more complex instruction, called \\(instruction_1\\), from an initial instruction \\(instruction_0\\): instruction_1 = \"Provide three recommendations for maintaining well-being, ensuring one focuses on mental health.\"\n With sequences of evolved instructions, we use a further LLM to automatically rank and score them. We provide the 6 instructions at the same time. By providing all instructions together, we force the scoring model to look at minor complexity differences between evolved instructions. Encouraging the model to discriminate between instructions. Taking the example below, \\(instruction_0\\) and \\(instruction_1\\) could deserve the same score independently, but when compared together we would notice the slight difference that makes \\(instruction_1\\) more complex. In distilabel , we implement this like so: instruction_complexity_scorer = ComplexityScorer(\n name=\"instruction_complexity_scorer\",\n llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n input_mappings={\"instructions\": \"evolved_instructions\"},\n pipeline=pipeline,\n)\n\nexpand_evolved_instructions = ExpandColumns(\n name=\"expand_evolved_instructions\",\n columns=[\"evolved_instructions\", \"answers\", \"scores\"],\n output_mappings={\n \"evolved_instructions\": \"evolved_instruction\",\n \"answers\": \"answer\",\n \"scores\": \"evol_instruction_score\",\n },\n pipeline=pipeline,\n)\n\ninstruction_complexity_scorer.load()\n\n_evolved_instructions = next(instruction_complexity_scorer.process(([{\"evolved_instructions\": [PROMPT + instruction_1]}])))\n\nprint(\"Original Instruction:\")\nprint(instruction_1)\nprint(\"\\nEvolved Instruction:\")\nprint(_evolved_instructions[0][\"evolved_instructions\"][0].split(\"#Rewritten Prompt#:\\n\")[1])\n Output: Original Instruction:\nProvide three recommendations for maintaining well-being, ensuring one focuses on mental health.\n\nEvolved Instruction:\nSuggest three strategies for nurturing overall well-being, with the stipulation that at least one explicitly addresses the enhancement of mental health, incorporating evidence-based practices.\n "},{"location":"sections/pipeline_samples/papers/deita/#evol-quality-quality-evaluation","title":"EVOL-QUALITY: Quality Evaluation","text":"Now that we have scored the complexity of the instructions, we will focus on the quality of the responses. Similar to EVOL COMPLEXITY, the authors introduced EVOL QUALITY, a method based on LLMs, instead of humans, to automatically score the quality of the response. From an instruction-response pair, \\((I, R)\\), the goal is to make the response evolve into a more helpful and relevant response. The key difference is that we need to also provide the first instruction to guide evolution. Let's take back our example from GPT-4-LLM. Here we have the response \\(response_0\\) and its initial instruction \\(instruction_0\\): instruction_0 = \"Give three tips for staying healthy.\"\nreponse_0 = \"1. 
Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases. 2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week. 3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\"\n Again the authors provided several prompts you could use to make your response evolve according to some guidelines. For example, this one was used to enrich the answer: PROMPT = \"\"\"I want you to act as a Response Rewriter\nYour goal is to enhance the quality of the response given by an AI assistant\nto the #Given Prompt# through rewriting.\nBut the rewritten response must be reasonable and must be understood by humans.\nYour rewriting cannot omit the non-text parts such as the table and code in\n#Given Prompt# and #Given Response#. Also, please do not omit the input\nin #Given Prompt#.\nYou Should enhance the quality of the response using the following method:\nPlease make the Response more in-depth\nYou should try your best not to make the #Rewritten Response# become verbose,\n#Rewritten Response# can only add 10 to 20 words into #Given Response#.\n\u2018#Given Response#\u2019, \u2018#Rewritten Response#\u2019, \u2018given response\u2019 and \u2018rewritten response\u2019\nare not allowed to appear in #Rewritten Response#\n#Given Prompt#:\n<instruction_0>\n#Given Response#:\n<response_0>\n#Rewritten Response#:\n\"\"\"\n Prompting this to an LLM, you will automatically get a more enriched response, called \\(response_1\\), from an initial response \\(response_0\\) and initial instruction \\(instruction_0\\): evol_response_quality = EvolQuality(\n name=\"evol_response_quality\",\n llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n num_evolutions=5,\n store_evolutions=True,\n include_original_response=True,\n input_mappings={\n \"instruction\": \"evolved_instruction\",\n \"response\": \"answer\",\n },\n pipeline=pipeline,\n)\n\nevol_response_quality.load()\n\n_evolved_responses = next(evol_response_quality.process([{\"instruction\": PROMPT + instruction_0, \"response\": reponse_0}]))\n\nprint(\"Original Response:\")\nprint(reponse_0)\nprint(\"\\nEvolved Response:\")\nprint(*_evolved_responses[0]['evolved_responses'], sep=\"\\n\")\n And now, as in EVOL COMPLEXITY you iterate through this path and use different prompts to make your responses more relevant, helpful or creative. In the paper, they make 4 more iterations to get 5 evolved responses \\((R0, R1, R2, R3, R4)\\) which makes 5 different responses for one initial instruction at the end of this step. 
response_quality_scorer = QualityScorer(\n name=\"response_quality_scorer\",\n llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n input_mappings={\n \"instruction\": \"evolved_instruction\",\n \"responses\": \"evolved_responses\",\n },\n pipeline=pipeline,\n)\n\nexpand_evolved_responses = ExpandColumns(\n name=\"expand_evolved_responses\",\n columns=[\"evolved_responses\", \"scores\"],\n output_mappings={\n \"evolved_responses\": \"evolved_response\",\n \"scores\": \"evol_response_score\",\n },\n pipeline=pipeline,\n)\n\nresponse_quality_scorer.load()\n\n_scored_responses = next(response_quality_scorer.process([{\"instruction\": PROMPT + instruction_0, \"responses\": _evolved_responses[0]['evolved_responses']}]))\n\nprint(\"Original Response:\")\nprint(reponse_0)\n\nprint(\"\\nScore, Evolved Response:\")\nprint(*zip(_scored_responses[0][\"scores\"], _evolved_responses[0]['evolved_responses']), sep=\"\\n\")\n Output: Original Response:\n1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases. 2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week. 3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\n\nScore, Evolved Response:\n(4.0, 'Here are three essential tips for maintaining good health: \\n1. Prioritize regular exercise \\n2. Eat a balanced diet with plenty of fruits and vegetables \\n3. Get an adequate amount of sleep each night.')\n(2.0, 'Here are three effective strategies to maintain a healthy lifestyle.')\n(5.0, 'Here are three practical tips to maintain good health: Ensure a balanced diet, engage in regular exercise, and prioritize sufficient sleep. These practices support overall well-being.')\n "},{"location":"sections/pipeline_samples/papers/deita/#improving-data-diversity","title":"Improving Data Diversity","text":"One main component of good data to instruct-tune LLMs is diversity. Real world data can often contain redundancy due repetitive and homogeneous data. The authors of the DEITA paper tackle the challenge of ensuring data diversity in the instruction tuning LLMs to avoid the pitfalls of data redundancy that can lead to over-fitting or poor generalization. They propose an embedding-based method to filter data for diversity. This method, called Repr Filter, uses embeddings generated by the Llama 1 13B model to represent instruction-response pairs in a vector space. The diversity of a new data sample is assessed based on the cosine distance between its embedding and that of its nearest neighbor in the already selected dataset. If this distance is greater than a specified threshold, the sample is considered diverse and is added to the selection. This process prioritizes diversity by assessing each sample's contribution to the variety of the dataset until the data selection budget is met. This approach effectively maintains the diversity of the data used for instruction tuning, as demonstrated by the DEITA models outperforming or matching state-of-the-art models with significantly less training data. 
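Before wiring this into the pipeline, it can help to see the selection logic in isolation. The snippet below is only a conceptual sketch of such an embedding-based filter; the function name, the greedy score-ordered loop and the default threshold are illustrative assumptions, not the actual DeitaFiltering implementation used later in the pipeline.

```python
import numpy as np


def repr_filter(embeddings, scores, data_budget, diversity_threshold=0.04):
    """Sketch of the Repr Filter idea: visit samples from highest to lowest score
    and keep a sample only if its cosine distance to the nearest already-selected
    embedding is above the threshold, until the data budget is met."""
    embeddings = np.asarray(embeddings, dtype=float)
    # Normalize once so cosine distance is simply 1 - dot product
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    selected_idx = []
    for i in np.argsort(scores)[::-1]:  # descending score order
        if len(selected_idx) >= data_budget:
            break
        if not selected_idx:
            selected_idx.append(i)
            continue
        # Cosine distance to the nearest neighbour among the selected samples
        sims = embeddings[selected_idx] @ embeddings[i]
        nearest_distance = 1.0 - sims.max()
        if nearest_distance > diversity_threshold:
            selected_idx.append(i)
    return selected_idx


# Toy usage with random embeddings and scores
rng = np.random.default_rng(0)
idx = repr_filter(rng.normal(size=(100, 16)), rng.random(100), data_budget=10)
print(len(idx), "samples selected")
```
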
In this implementation of DEITA we use the hidden state of the last layer of the Llama 2 model to generate embeddings, instead of a sentence transformer model, because we found that it improved the diversity of the data selection. generate_conversation = ConversationTemplate(\n name=\"generate_conversation\",\n input_mappings={\n \"instruction\": \"evolved_instruction\",\n \"response\": \"evolved_response\",\n },\n pipeline=pipeline,\n)\n\ngenerate_embeddings = GenerateEmbeddings(\n name=\"generate_embeddings\",\n llm=TransformersLLM(\n model=\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n device=\"cuda\",\n torch_dtype=\"float16\",\n ),\n input_mappings={\"text\": \"conversation\"},\n input_batch_size=5,\n pipeline=pipeline,\n)\n\ndeita_filtering = DeitaFiltering(name=\"deita_filtering\", pipeline=pipeline)\n "},{"location":"sections/pipeline_samples/papers/deita/#build-the-distilabel-pipeline","title":"Build the \u2697 distilabel Pipeline ","text":"Now we're ready to build a distilabel pipeline using the DEITA method: load_data.connect(evol_instruction_complexity)\nevol_instruction_complexity.connect(instruction_complexity_scorer)\ninstruction_complexity_scorer.connect(expand_evolved_instructions)\nexpand_evolved_instructions.connect(evol_response_quality)\nevol_response_quality.connect(response_quality_scorer)\nresponse_quality_scorer.connect(expand_evolved_responses)\nexpand_evolved_responses.connect(generate_conversation)\ngenerate_conversation.connect(generate_embeddings)\ngenerate_embeddings.connect(deita_filtering)\n Now we can run the pipeline. We use the step names to reference them in the pipeline configuration: distiset = pipeline.run(\n parameters={\n \"load_data\": {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-50\",\n \"split\": \"train\",\n },\n \"evol_instruction_complexity\": {\n \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 512, \"temperature\": 0.7}}\n },\n \"instruction_complexity_scorer\": {\n \"llm\": {\"generation_kwargs\": {\"temperature\": 0.0}}\n },\n \"evol_response_quality\": {\n \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 512, \"temperature\": 0.7}}\n },\n \"response_quality_scorer\": {\"llm\": {\"generation_kwargs\": {\"temperature\": 0.0}}},\n \"deita_filtering\": {\"data_budget\": 500, \"diversity_threshold\": 0.04},\n },\n use_cache=False,\n)\n We can push the results to the Hugging Face Hub: distiset.push_to_hub(\"distilabel-internal-testing/deita-colab\")\n "},{"location":"sections/pipeline_samples/papers/deita/#results","title":"Results","text":"Again, to show the relevance of EVOL QUALITY method, the authors evaluated on the MT-bench models fine-tuned with different data selections according to how we defined quality responses according to an instruction. Each time they selected 6k data according to the quality score: Credit: Liu et al. (2023) The score is much better when selecting data with the EVOL QUALITY method than when we select randomly or according to the length, making a more qualitative response if longer. Nevertheless, we see that the margin we may have seen in the complexity score is thinner. And we'll discuss the strategy in a later part. Nevertheless, this strategy looks to improve the fine-tuning compared to the baselines and now we're interested in mixing quality and complexity assessment with a diversity evaluation to find the right trade-off in our selection process. 
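As a rough illustration of how the complexity and quality scores can be mixed before the diversity filter is applied, one simple option is to rank rows by the product of the two scores produced above (the evol_instruction_score and evol_response_score columns). The multiplicative combination and the toy values below are assumptions for illustration, not the exact configuration used by DeitaFiltering.

```python
# Illustrative only: rank instruction/response pairs by a combined "evol score"
# and hand the ranked rows to the diversity filter until the budget is met.
rows = [
    {"id": 0, "evol_instruction_score": 4.0, "evol_response_score": 5.0},
    {"id": 1, "evol_instruction_score": 2.0, "evol_response_score": 5.0},
    {"id": 2, "evol_instruction_score": 3.0, "evol_response_score": 3.0},
]

for row in rows:
    row["evol_score"] = row["evol_instruction_score"] * row["evol_response_score"]

ranked = sorted(rows, key=lambda r: r["evol_score"], reverse=True)
print([r["id"] for r in ranked])  # [0, 2, 1]
```
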
"},{"location":"sections/pipeline_samples/papers/deita/#conclusion","title":"Conclusion","text":"In conclusion, if you are looking for some efficient method to align an open-source LLM to your business case with a constrained budget, the solutions provided by DEITA are really worth the shot. This data-centric approach enables one to focus on the content of the dataset to have the best results instead of \"just\" scaling the instruction-tuning with more, and surely less qualitative, data. In a nutshell, the strategy developed, through automatically scoring instructions-responses, aims to substitute the human preference step proprietary models such as GPT-4 have been trained with. There are a few improvements we could think about when it comes to how to select the good data, but it opens a really great way in instruct-tuning LLM with lower computational needs making the whole process intellectually relevant and more sustainable than most of the other methods. We'd be happy to help you out with aligning an LLM with your business case drawing inspiration from such a methodology. "},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/","title":"Instruction Backtranslation","text":"\"Self Alignment with Instruction Backtranslation\" presents a scalable method to build high-quality instruction following a language model by automatically labeling human-written text with corresponding instructions. Their approach, named instruction backtranslation, starts with a language model finetuned on a small amount of seed data, and a given web corpus. The seed model is used to construct training examples by generating instruction prompts for web documents (self-augmentation), and then selecting high-quality examples from among these candidates (self-curation). This data is then used to finetune a stronger model. Their self-training approach assumes access to a base language model, a small amount of seed data, and a collection of unlabelled examples, e.g. a web corpus. The unlabelled data is a large, diverse set of human-written documents that includes writing about all manner of topics humans are interested in \u2013 but crucially is not paired with instructions. A first key assumption is that there exists some subset of this very large human-written text that would be suitable as gold generations for some user instructions. A second key assumption is that they can predict instructions for these candidate gold answers that can be used as high-quality example pairs to train an instruction-following model. Their overall process, called instruction back translation performs two core steps: -
Self-augment: Generate instructions for unlabelled data, i.e. the web corpus, to produce candidate training data of (instruction, output) pairs for instruction tuning. -
Self-curate: Self-select high-quality demonstration examples as training data to finetune the base model to follow instructions. This approach is done iteratively, where a better intermediate instruction-following model can improve on selecting data for finetuning in the next iteration. This replication covers the self-curation step, i.e. the second/latter step mentioned above, so that the proposed prompting approach can be used to rate the quality of the generated text, which can either be synthetically generated or real human-written text. "},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#replication","title":"Replication","text":"To replicate the paper we will be using distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes. "},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#installation","title":"Installation","text":"To replicate Self Alignment with Instruction Backtranslation one will need to install distilabel as follows: pip install \"distilabel[hf-inference-endpoints,openai]>=1.0.0\"\n And since we will be using InferenceEndpointsLLM (installed via the extra hf-inference-endpoints ) we will need to deploy those in advance either locally or in the Hugging Face Hub (alternatively, the serverless endpoints can also be used, but most of the time the inference is slower, and there's a limited quota to use them as they are free) and set both the HF_TOKEN (to use the InferenceEndpointsLLM ) and the OPENAI_API_KEY environment variable value (to use the OpenAILLM ). "},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#building-blocks","title":"Building blocks","text":" LoadDataFromHub : Generator Step to load a dataset from the Hugging Face Hub. TextGeneration : Task to generate responses for a given instruction using an LLM. InferenceEndpointsLLM : LLM that runs a model from an Inference Endpoint in the Hugging Face Hub. InstructionBacktranslation : Task that generates a score and a reason for a response for a given instruction using the Self Alignment with Instruction Backtranslation prompt. OpenAILLM : LLM that loads a model from OpenAI. "},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#code","title":"Code","text":"As mentioned before, we will put the previously mentioned building blocks together to replicate Self Alignment with Instruction Backtranslation. 
from distilabel.models import InferenceEndpointsLLM, OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub, KeepColumns\nfrom distilabel.steps.tasks import InstructionBacktranslation, TextGeneration\n\n\nwith Pipeline(name=\"self-alignment-with-instruction-backtranslation\") as pipeline:\n load_hub_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"},\n )\n\n text_generation = TextGeneration(\n name=\"text_generation\",\n llm=InferenceEndpointsLLM(\n base_url=\"<INFERENCE_ENDPOINT_URL>\",\n tokenizer_id=\"argilla/notus-7b-v1\",\n model_display_name=\"argilla/notus-7b-v1\",\n ),\n input_batch_size=10,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n instruction_backtranslation = InstructionBacktranslation(\n name=\"instruction_backtranslation\",\n llm=OpenAILLM(model=\"gpt-4\"),\n input_batch_size=10,\n output_mappings={\"model_name\": \"scoring_model\"},\n )\n\n keep_columns = KeepColumns(\n name=\"keep_columns\",\n columns=[\n \"instruction\",\n \"generation\",\n \"generation_model\",\n \"score\",\n \"reason\",\n \"scoring_model\",\n ],\n )\n\n load_hub_dataset >> text_generation >> instruction_backtranslation >> keep_columns\n Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched. distiset = pipeline.run(\n parameters={\n load_hub_dataset.name: {\n \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n \"split\": \"test\",\n },\n text_generation.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 1024,\n \"temperature\": 0.7,\n },\n },\n },\n instruction_backtranslation.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 1024,\n \"temperature\": 0.7,\n },\n },\n },\n },\n)\n Finally, we can optionally push the generated dataset, named Distiset , to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub. distiset.push_to_hub(\n \"instruction-backtranslation-instruction-dataset\",\n private=True,\n)\n "},{"location":"sections/pipeline_samples/papers/math_shepherd/","title":"Create datasets to train a Process Reward Model using Math-Shepherd","text":"This example will introduce Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations, an innovative math process reward model (PRM) which assigns reward scores to each step of math problem solutions. Specifically, we will present a recipe to create datasets to train such models. The final sections contain 2 pipeline examples to run the pipeline depending with more or less resources. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#replica","title":"Replica","text":"Unlike traditional models that only look at final answers (Output Reward Models or ORM), this system evaluates each step of a mathematical solution and assigns reward scores to individual solution steps. Let's see the Figure 2 from the paper, which makes a summary of the labelling approach presented in their work. In the traditional ORM approach, the annotation was done depending on the final outcome, while the Process Reward Model (PRM) allows labelling the different steps that lead to a solution, making for a richer set of information. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#steps-involved","title":"Steps involved","text":" -
MathShepherdGenerator : This step is in charge of generating solutions for the instruction. Depending on the value set for M , this step can be used to generate both the golden_solution , to be used as a reference for the labeller, or the set of solutions to be labelled. For the solutions column we want some diversity, so that the model reaches both good and bad solutions and we have a representative sample for the labeller; for that reason it may be better to use a \"weaker\" model. -
MathShepherdCompleter . This task does the job of the completer in the paper, generating completions as presented in Figure 2, section 3.3.2. It doesn't generate a column on its own, but updates the steps generated in the solutions column from the MathShepherdGenerator , using the golden_solution as the reference to label the data. So in order for this step to work, we need both of these columns in our dataset. Depending on the type of dataset, we may already have access to the golden_solution , even if it's under a different name, but it's not the same for the solutions . -
FormatPRM . This step does the auxiliary job of preparing the data to follow the format defined in the paper of having two columns input and label . After running the MathShepherdCompleter , we have raw data that can be formatted as the user want. Using ExpandColumns and this step, one can directly obtain the same format presented in the dataset shared in the paper: peiyi9979/Math-Shepherd. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#data-preparation","title":"Data preparation","text":"For this example, just as the original paper, we are using the openai/gsm8k dataset. We only need a dataset with instructions to be solved (in this case it corresponds to the question column), and we can generate everything else using our predefined steps. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#building-the-pipeline","title":"Building the pipeline","text":"The pipeline uses openai/gsm8k as reference, but the pipeline can be applied to different datasets, keep in mind the prompts can be modified with the current definition, by tweaking the extra_rules and few_shots in each task: from datasets import load_dataset\n\nfrom distilabel.steps.tasks import MathShepherdCompleter, MathShepherdGenerator, FormatPRM\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs, ExpandColumns\n\nds_name = \"openai/gsm8k\"\n\nds = load_dataset(ds_name, \"main\", split=\"test\").rename_column(\"question\", \"instruction\").select(range(3)) # (1)\n\nwith Pipeline(name=\"Math-Shepherd\") as pipe:\n model_id_70B = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n model_id_8B = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n\n llm_70B = InferenceEndpointsLLM(\n model_id=model_id_70B,\n tokenizer_id=model_id_70B,\n generation_kwargs={\"max_new_tokens\": 1024, \"temperature\": 0.6},\n )\n llm_8B = InferenceEndpointsLLM(\n model_id=model_id_8B,\n tokenizer_id=model_id_8B,\n generation_kwargs={\"max_new_tokens\": 2048, \"temperature\": 0.6},\n ) # (2)\n\n generator_golden = MathShepherdGenerator(\n name=\"golden_generator\",\n llm=llm_70B,\n ) # (3)\n generator = MathShepherdGenerator(\n name=\"generator\",\n llm=llm_8B,\n use_default_structured_output=True, # (9)\n M=5\n ) #\u00a0(4)\n completer = MathShepherdCompleter(\n name=\"completer\",\n llm=llm_8B,\n use_default_structured_output=True,\n N=4\n ) # (5)\n\n combine = CombineOutputs()\n\n expand = ExpandColumns(\n name=\"expand_columns\",\n columns=[\"solutions\"],\n split_statistics=True,\n ) #\u00a0(6)\n formatter = FormatPRM(name=\"format_prm\") # (7)\n\n [generator_golden, generator] >> combine >> completer >> expand >> formatter # (8)\n -
We will use just 3 rows from the sample dataset, and rename the \"question\" column to \"instruction\", to match the column name expected by the MathShepherdGenerator . -
We will use 2 different LLMs, meta-llama/Meta-Llama-3.1-70B-Instruct (a stronger model for the golden_solution ) and meta-llama/Meta-Llama-3.1-8B-Instruct (a weaker one to generate candidate solutions, and the completions). -
This MathShepherdGenerator task, which uses the stronger model, will generate the golden_solution for us, i.e. the step-by-step solution for the task. -
Another MathShepherdGenerator task, but in this case using the weaker model, will generate the candidate solutions (M=5 in total). -
Now the MathShepherdCompleter task will generate N=4 completions for each step of each candidate solution in the solutions column, and label them using the golden_solution as shown in Figure 2 in the paper. This step will add the label (it uses + and - tags following the implementation in the paper, but these values can be modified) to the solutions column in place, instead of generating an additional column, but the intermediate completions won't be shown at the end. -
The ExpandColumns step expands the solutions column to match the instruction, so if we had set M=5, we would now have 5 instruction-solution pairs per original row. We set split_statistics to True to ensure the distilabel_metadata is split accordingly; otherwise the number of tokens for each solution would count as the tokens needed for the whole list of generated solutions. One can omit both this and the following step and process the data for training as preferred. -
And finally, the FormatPRM generates two columns: input and label which prepare the data for training as presented in the original Math-Shepherd dataset. -
Both the generator_golden and generator can be run in parallel as there's no dependency between them, and after that we combine the results and pass them to the completer . Finally, we use the expand and formatter steps to prepare the data in the expected format to train the Process Reward Model as defined in the original paper. -
Generate structured outputs to ensure it's easier to parse them, otherwise the models can fail a lot of times with an easy to parse list. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#script-and-final-dataset","title":"Script and final dataset","text":"To see all the pieces in place, take a look at the full pipeline: Run python examples/pipe_math_shepherd.py\n Full pipeline pipe_math_shepherd.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs, ExpandColumns\nfrom distilabel.steps.tasks import (\n FormatPRM,\n MathShepherdCompleter,\n MathShepherdGenerator,\n)\n\nds_name = \"openai/gsm8k\"\n\nds = (\n load_dataset(ds_name, \"main\", split=\"test\")\n .rename_column(\"question\", \"instruction\")\n .select(range(3))\n)\n\n\nwith Pipeline(name=\"Math-Shepherd\") as pipe:\n model_id_70B = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n model_id_8B = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n\n llm_70B = InferenceEndpointsLLM(\n model_id=model_id_8B,\n tokenizer_id=model_id_8B,\n generation_kwargs={\"max_new_tokens\": 1024, \"temperature\": 0.5},\n )\n llm_8B = InferenceEndpointsLLM(\n model_id=model_id_8B,\n tokenizer_id=model_id_8B,\n generation_kwargs={\"max_new_tokens\": 2048, \"temperature\": 0.7},\n )\n\n generator_golden = MathShepherdGenerator(\n name=\"golden_generator\",\n llm=llm_70B,\n )\n generator = MathShepherdGenerator(\n name=\"generator\",\n llm=llm_8B,\n M=5,\n )\n completer = MathShepherdCompleter(name=\"completer\", llm=llm_8B, N=4)\n\n combine = CombineOutputs()\n\n expand = ExpandColumns(\n name=\"expand_columns\",\n columns=[\"solutions\"],\n split_statistics=True,\n )\n formatter = FormatPRM(name=\"format_prm\")\n [generator_golden, generator] >> combine >> completer >> expand >> formatter\n\n\nif __name__ == \"__main__\":\n distiset = pipe.run(use_cache=False, dataset=ds)\n distiset.push_to_hub(\"plaguss/test_math_shepherd_prm\")\n The resulting dataset can be seen at: plaguss/test_math_shepherd_prm. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#pipeline-with-vllm-and-ray","title":"Pipeline with vLLM and ray","text":"This section contains an alternative way of running the pipeline with a bigger outcome. To showcase how to scale the pipeline, we are using for the 3 generating tasks Qwen/Qwen2.5-72B-Instruct, highly improving the final quality as it follows much closer the prompt given. Also, we are using vLLM and 3 nodes (one per task in this case), to scale up the generation process. 
Math-Shepherd's bigger pipeline from datasets import load_dataset\n\nfrom distilabel.models import vLLM\nfrom distilabel.steps import StepResources\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs, ExpandColumns\nfrom distilabel.steps.tasks import (\n FormatPRM,\n MathShepherdCompleter,\n MathShepherdGenerator,\n)\n\nds_name = \"openai/gsm8k\"\n\nds = (\n load_dataset(ds_name, \"main\", split=\"test\")\n .rename_column(\"question\", \"instruction\")\n)\n\n\nwith Pipeline(name=\"Math-Shepherd\").ray() as pipe: # (1)\n\n model_id_72B = \"Qwen/Qwen2.5-72B-Instruct\"\n\n llm_72B = vLLM(\n model=model_id_72B,\n tokenizer=model_id_72B,\n extra_kwargs={\n \"tensor_parallel_size\": 8, # Number of GPUs per node\n \"max_model_len\": 2048,\n },\n generation_kwargs={\n \"temperature\": 0.5,\n \"max_new_tokens\": 4096,\n },\n )\n\n generator_golden = MathShepherdGenerator(\n name=\"golden_generator\",\n llm=llm_72B,\n input_batch_size=50,\n output_mappings={\"model_name\": \"model_name_golden_generator\"},\n resources=StepResources(replicas=1, gpus=8) # (2)\n )\n generator = MathShepherdGenerator(\n name=\"generator\",\n llm=llm_72B,\n input_batch_size=50,\n M=5,\n use_default_structured_output=True,\n output_mappings={\"model_name\": \"model_name_generator\"},\n resources=StepResources(replicas=1, gpus=8)\n )\n completer = MathShepherdCompleter(\n name=\"completer\", \n llm=llm_72B,\n N=8,\n use_default_structured_output=True,\n output_mappings={\"model_name\": \"model_name_completer\"},\n resources=StepResources(replicas=1, gpus=8)\n )\n\n combine = CombineOutputs()\n\n expand = ExpandColumns(\n name=\"expand_columns\",\n columns=[\"solutions\"],\n split_statistics=True,\n\n )\n formatter = FormatPRM(name=\"format_prm\", format=\"trl\") # (3)\n\n [generator_golden, generator] >> combine >> completer >> expand >> formatter\n\n\nif __name__ == \"__main__\":\n distiset = pipe.run(use_cache=False, dataset=ds, dataset_batch_size=50)\n if distiset:\n distiset.push_to_hub(\"plaguss/test_math_shepherd_prm_ray\")\n -
Transform the pipeline to run using the ray backend. -
Assign the resources: the number of replicas is 1 as we want a single instance of the task on a node, and the number of GPUs equals 8, using a whole node. Given that we defined the script in the slurm file to use 3 nodes, this will use all 3 available nodes, with 8 GPUs for each of these tasks. -
Prepare the columns in the format expected by TRL for training. Click to see the slurm file used to run the previous pipeline. It's our go to slurm file, using 3 8xH100 nodes. Slurm file #!/bin/bash\n#SBATCH --job-name=math-shepherd-test-ray\n#SBATCH --partition=hopper-prod\n#SBATCH --qos=normal\n#SBATCH --nodes=3\n#SBATCH --exclusive\n#SBATCH --ntasks-per-node=1\n#SBATCH --gpus-per-node=8\n#SBATCH --output=./logs/%x-%j.out\n#SBATCH --err=./logs/%x-%j.err\n#SBATCH --time=48:00:00\n\nset -ex\n\nmodule load cuda/12.1\n\necho \"SLURM_JOB_ID: $SLURM_JOB_ID\"\necho \"SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST\"\n\nsource .venv/bin/activate\n\n# Getting the node names\nnodes=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\")\nnodes_array=($nodes)\n\n# Get the IP address of the head node\nhead_node=${nodes_array[0]}\nhead_node_ip=$(srun --nodes=1 --ntasks=1 -w \"$head_node\" hostname --ip-address)\n\n# Start Ray head node\nport=6379\nip_head=$head_node_ip:$port\nexport ip_head\necho \"IP Head: $ip_head\"\n\n# Generate a unique Ray tmp dir for the head node\nhead_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_head\"\n\necho \"Starting HEAD at $head_node\"\nsrun --nodes=1 --ntasks=1 -w \"$head_node\" \\\n ray start --head --node-ip-address=\"$head_node_ip\" --port=$port \\\n --dashboard-host=0.0.0.0 \\\n --dashboard-port=8265 \\\n --temp-dir=\"$head_tmp_dir\" \\\n --block &\n\n# Give some time to head node to start...\nsleep 10\n\n# Start Ray worker nodes\nworker_num=$((SLURM_JOB_NUM_NODES - 1))\n\n# Start from 1 (0 is head node)\nfor ((i = 1; i <= worker_num; i++)); do\n node_i=${nodes_array[$i]}\n worker_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_worker_$i\"\n echo \"Starting WORKER $i at $node_i\"\n srun --nodes=1 --ntasks=1 -w \"$node_i\" \\\n ray start --address \"$ip_head\" \\\n --temp-dir=\"$worker_tmp_dir\" \\\n --block &\n sleep 5\ndone\n\n# Give some time to the Ray cluster to gather info\nsleep 60\n\n# Finally submit the job to the cluster\nRAY_ADDRESS=\"http://$head_node_ip:8265\" ray job submit --working-dir pipeline -- python -u pipeline_math_shepherd_ray.py\n Final dataset The resulting dataset can be seen at: plaguss/test_math_shepherd_prm_ray. "},{"location":"sections/pipeline_samples/papers/prometheus/","title":"Prometheus 2","text":"\"Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models\" presents Prometheus 2, a new and more powerful evaluator LLM compared to Prometheus (its predecessor) presented in \"Prometheus: Inducing Fine-grained Evaluation Capability in Language Models\"; since GPT-4, as well as other proprietary LLMs, are commonly used to assess the quality of the responses for various LLMs, but there are concerns about transparency, controllability, and affordability, that motivate the need of open-source LLMs specialized in evaluations. Existing open evaluator LMs exhibit critical shortcomings: - They issue scores that significantly diverge from those assigned by humans.
- They lack the flexibility to perform both direct assessment and pairwise ranking, the two most prevalent forms of assessment.
Additionally, they do not possess the ability to evaluate based on custom evaluation criteria, focusing instead on general attributes like helpfulness and harmlessness. Prometheus 2 is capable of processing both direct assessment and pair-wise ranking formats grouped with user-defined evaluation criteria. Prometheus 2 released two variants: prometheus-eval/prometheus-7b-v2.0 : fine-tuned on top of mistralai/Mistral-7B-Instruct-v0.2 prometheus-eval/prometheus-8x7b-v2.0 : fine-tuned on top of mistralai/Mixtral-8x7B-Instruct-v0.1 Both models have been fine-tuned for both direct assessment and pairwise ranking tasks i.e. assessing the quality of a single isolated response for a given instruction with or without a reference answer and assessing the quality of one response against another one for a given instruction with or without a reference answer, respectively. On four direct assessment benchmarks and four pairwise ranking benchmarks, Prometheus 2 scores the highest correlation and agreement with humans and proprietary LM judges among all tested open evaluator LMs. Their models, code, and data are all publicly available at prometheus-eval/prometheus-eval . "},{"location":"sections/pipeline_samples/papers/prometheus/#replication","title":"Replication","text":"Note The section is named Replication but in this case we're not replicating the Prometheus 2 paper per se, but rather showing how to use the PrometheusEval task implemented within distilabel to evaluate the quality of the responses from a given instruction using the Prometheus 2 model. To showcase Prometheus 2 we will be using the PrometheusEval task implemented in distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes. "},{"location":"sections/pipeline_samples/papers/prometheus/#installation","title":"Installation","text":"To reproduce the code below, one will need to install distilabel as it follows: pip install \"distilabel[vllm]>=1.1.0\"\n Alternatively, it's recommended to install Dao-AILab/flash-attention to benefit from Flash Attention 2 speed ups during inference via vllm . pip install flash-attn --no-build-isolation\n Note The installation notes above assume that you are using a VM with one GPU accelerator with at least the required VRAM to fit prometheus-eval/prometheus-7b-v2.0 in bfloat16 (28GB); but if you have enough VRAM to fit their 8x7B model in bfloat16 (~90GB) you can use prometheus-eval/prometheus-8x7b-v2.0 instead. "},{"location":"sections/pipeline_samples/papers/prometheus/#building-blocks","title":"Building blocks","text":" -
LoadDataFromHub : GeneratorStep to load a dataset from the Hugging Face Hub. -
PrometheusEval : Task that assesses the quality of a response for a given instruction using any of the Prometheus 2 models. vLLM : LLM that loads a model from the Hugging Face Hub via vllm-project/vllm. Note Since the Prometheus 2 models use a slightly different chat template than mistralai/Mistral-7B-Instruct-v0.2 , we need to set the chat_template parameter to [INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST] so as to properly format the input for Prometheus 2. -
(Optional) KeepColumns : Task that keeps only the specified columns in the dataset, used to remove the undesired columns. "},{"location":"sections/pipeline_samples/papers/prometheus/#code","title":"Code","text":"As mentioned before, we will put the previously mentioned building blocks together to see how Prometheus 2 can be used via distilabel . from distilabel.models import vLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import KeepColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import PrometheusEval\n\nif __name__ == \"__main__\":\n with Pipeline(name=\"prometheus\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n repo_id=\"HuggingFaceH4/instruction-dataset\",\n split=\"test\",\n output_mappings={\"prompt\": \"instruction\", \"completion\": \"generation\"},\n )\n\n task = PrometheusEval(\n name=\"task\",\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"factual-validity\",\n reference=False,\n num_generations=1,\n group_generations=False,\n )\n\n keep_columns = KeepColumns(\n name=\"keep_columns\",\n columns=[\"instruction\", \"generation\", \"feedback\", \"result\", \"model_name\"],\n )\n\n load_dataset >> task >> keep_columns\n Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched. distiset = pipeline.run(\n parameters={\n task.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 1024,\n \"temperature\": 0.7,\n },\n },\n },\n },\n)\n Finally, we can optionally push the generated dataset, named Distiset , to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub. distiset.push_to_hub(\n \"instruction-dataset-prometheus\",\n private=True,\n)\n "},{"location":"sections/pipeline_samples/papers/ultrafeedback/","title":"UltraFeedback","text":"UltraFeedback: Boosting Language Models with High-quality Feedback is a paper published by OpenBMB which proposes UltraFeedback , a large-scale, fine-grained, diverse preference dataset, used for training powerful reward models and critic models. UltraFeedback collects about 64k prompts from diverse resources (including UltraChat, ShareGPT, Evol-Instruct, TruthfulQA, FalseQA, and FLAN), then they use these prompts to query multiple LLMs (commercial models, Llama models ranging 7B to 70B, and non-Llama models) and generate four different responses for each prompt, resulting in a total of 256k samples i.e. the UltraFeedback will rate four responses on every OpenAI request. To collect high-quality preference and textual feedback, they design a fine-grained annotation instruction, which contains four different aspects, namely instruction-following, truthfulness, honesty and helpfulness (even though within the paper they also mention a fifth one named verbalized calibration). Finally, GPT-4 is used to generate the ratings for the generated responses to the given prompt using the previously mentioned aspects. "},{"location":"sections/pipeline_samples/papers/ultrafeedback/#replication","title":"Replication","text":"To replicate the paper we will be using distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes. 
Also for testing purposes we will just show how to evaluate the generated responses for a given prompt using a new global aspect named overall-rating defined by Argilla, which computes the average of the four aspects, so as to reduce the number of requests to be sent to OpenAI; but note that all the aspects are implemented within distilabel and can be used instead for a more faithful reproduction. Besides that, we will generate three responses for each instruction using three LLMs selected from a pool of six: HuggingFaceH4/zephyr-7b-beta , argilla/notus-7b-v1 , google/gemma-1.1-7b-it , meta-llama/Meta-Llama-3-8B-Instruct , HuggingFaceH4/zephyr-7b-gemma-v0.1 and mlabonne/UltraMerge-7B . "},{"location":"sections/pipeline_samples/papers/ultrafeedback/#installation","title":"Installation","text":"To replicate UltraFeedback one will need to install distilabel as follows: pip install \"distilabel[argilla,openai,vllm]>=1.0.0\"\n And since we will be using vllm we will need to use a VM with at least 6 NVIDIA GPUs with at least 16GB of memory each to run the text generation, and set the OPENAI_API_KEY environment variable value. "},{"location":"sections/pipeline_samples/papers/ultrafeedback/#building-blocks","title":"Building blocks","text":" LoadDataFromHub : Generator Step to load a dataset from the Hugging Face Hub. sample_n_steps : Function to create a routing_batch_function that samples n downstream steps for each batch generated by the upstream step. This is the key to replicating the LLM pooling mechanism described in the paper. TextGeneration : Task to generate responses for a given instruction using an LLM. vLLM : LLM that loads a model from the Hugging Face Hub using vllm . GroupColumns : Task that combines multiple columns into a single one i.e. from string to list of strings. Useful when there are multiple parallel steps that are connected to the same node. UltraFeedback : Task that generates ratings for the responses of a given instruction using the UltraFeedback prompt. OpenAILLM : LLM that loads a model from OpenAI. KeepColumns : Task to keep the desired columns while removing the ones that are not needed, as well as defining the order for those. - (optional)
PreferenceToArgilla : Task to optionally push the generated dataset to Argilla to do some further analysis and human annotation. "},{"location":"sections/pipeline_samples/papers/ultrafeedback/#code","title":"Code","text":"As mentioned before, we will put the previously mentioned building blocks together to replicate UltraFeedback. from distilabel.models import OpenAILLM, vLLM\nfrom distilabel.pipeline import Pipeline, sample_n_steps\nfrom distilabel.steps import (\n GroupColumns,\n KeepColumns,\n LoadDataFromHub,\n PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nsample_three_llms = sample_n_steps(n=3)\n\n\nwith Pipeline(name=\"ultrafeedback-pipeline\") as pipeline:\n load_hub_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"},\n batch_size=2,\n )\n\n text_generation_with_notus = TextGeneration(\n name=\"text_generation_with_notus\",\n llm=vLLM(model=\"argilla/notus-7b-v1\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n text_generation_with_zephyr = TextGeneration(\n name=\"text_generation_with_zephyr\",\n llm=vLLM(model=\"HuggingFaceH4/zephyr-7b-gemma-v0.1\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n text_generation_with_gemma = TextGeneration(\n name=\"text_generation_with_gemma\",\n llm=vLLM(model=\"google/gemma-1.1-7b-it\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n text_generation_with_zephyr_gemma = TextGeneration(\n name=\"text_generation_with_zephyr_gemma\",\n llm=vLLM(model=\"HuggingFaceH4/zephyr-7b-gemma-v0.1\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n text_generation_with_llama = TextGeneration(\n name=\"text_generation_with_llama\",\n llm=vLLM(model=\"meta-llama/Meta-Llama-3-8B-Instruct\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n text_generation_with_ultramerge = TextGeneration(\n name=\"text_generation_with_ultramerge\",\n llm=vLLM(model=\"mlabonne/UltraMerge-7B\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n combine_columns = GroupColumns(\n name=\"combine_columns\",\n columns=[\"generation\", \"generation_model\"],\n output_columns=[\"generations\", \"generation_models\"],\n input_batch_size=2\n )\n\n ultrafeedback = UltraFeedback(\n name=\"ultrafeedback_openai\",\n llm=OpenAILLM(model=\"gpt-4-turbo-2024-04-09\"),\n aspect=\"overall-rating\",\n output_mappings={\"model_name\": \"ultrafeedback_model\"},\n )\n\n keep_columns = KeepColumns(\n name=\"keep_columns\",\n columns=[\n \"instruction\",\n \"generations\",\n \"generation_models\",\n \"ratings\",\n \"rationales\",\n \"ultrafeedback_model\",\n ],\n )\n\n (\n load_hub_dataset\n >> sample_three_llms\n >> [\n text_generation_with_notus,\n text_generation_with_zephyr,\n text_generation_with_gemma,\n text_generation_with_llama,\n text_generation_with_zephyr_gemma,\n text_generation_with_ultramerge\n ]\n >> combine_columns\n >> ultrafeedback\n >> keep_columns\n )\n\n # Optional: Push the generated dataset to Argilla, but will need to `pip install argilla` first\n # push_to_argilla = PreferenceToArgilla(\n # name=\"push_to_argilla\",\n # api_url=\"<ARGILLA_API_URL>\",\n # api_key=\"<ARGILLA_API_KEY>\", # type: ignore\n # dataset_name=\"ultrafeedback\",\n # dataset_workspace=\"admin\",\n # num_generations=2,\n # )\n # keep_columns >> push_to_argilla\n Note As we're using a 
relative small dataset, we're setting a low batch_size and input_batch_size so we have more batches for the routing_batch_function i.e. we will have more variety on the LLMs used to generate the responses. When using a large dataset, it's recommended to use a larger batch_size and input_batch_size to benefit from the vLLM optimizations for larger batch sizes, which makes the pipeline execution faster. Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched. distiset = pipeline.run(\n parameters={\n load_hub_dataset.name: {\n \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n \"split\": \"test\",\n },\n text_generation_with_notus.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n text_generation_with_zephyr.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n text_generation_with_gemma.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n text_generation_with_llama.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n text_generation_with_zephyr_gemma.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n text_generation_with_ultramerge.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n ultrafeedback.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 2048,\n \"temperature\": 0.7,\n }\n },\n },\n }\n)\n Finally, we can optionally push the generated dataset, named Distiset , to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub. distiset.push_to_hub(\n \"ultrafeedback-instruction-dataset\",\n private=True,\n)\n "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/","title":"Synthetic data generation for fine-tuning custom retrieval and reranking models","text":"!pip install \"distilabel[hf-inference-endpoints]\"\n !pip install \"sentence-transformers~=3.0\"\n Let's make the needed imports: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.steps import LoadDataFromHub\n\nfrom sentence_transformers import SentenceTransformer, CrossEncoder\nimport torch\n You'll need an HF_TOKEN to use the HF Inference Endpoints. Login to use it directly within this notebook. 
import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n !pip install \"distilabel[argilla, hf-inference-endpoints]\"\n Let's make the extra needed imports: import argilla as rg\n context = (\n\"\"\"\nThe text is a chunk from technical Python SDK documentation of Argilla.\nArgilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets.\nAlong with prose explanations, the text chunk may include code snippets and Python references.\n\"\"\"\n)\n llm = InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n tokenizer_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n)\n\nwith Pipeline(name=\"generate\") as pipeline:\n load_dataset = LoadDataFromHub(\n num_examples=15,\n output_mappings={\"chunks\": \"anchor\"},\n )\n generate_retrieval_pairs = GenerateSentencePair(\n name=\"generate_retrieval_pairs\",\n triplet=True,\n hard_negative=True,\n action=\"query\",\n llm=llm,\n input_batch_size=10,\n context=context,\n )\n generate_reranking_pairs = GenerateSentencePair(\n name=\"generate_reranking_pairs\",\n triplet=True,\n hard_negative=False, # to potentially generate non-relevant pairs\n action=\"semantically-similar\",\n llm=llm,\n input_batch_size=10,\n context=context,\n )\n\n load_dataset.connect(generate_retrieval_pairs, generate_reranking_pairs)\n Next, we can execute this using pipeline.run . We will provide some parameters to specific components within our pipeline. generation_kwargs = {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n}\n\ndistiset = pipeline.run( \n parameters={\n load_dataset.name: {\n \"repo_id\": \"plaguss/argilla_sdk_docs_raw_unstructured\",\n \"split\": \"train\",\n },\n generate_retrieval_pairs.name: generation_kwargs,\n generate_reranking_pairs.name: generation_kwargs,\n },\n use_cache=False, # False for demo\n)\n Data generation can be a expensive, so it is recommended to store the data somewhere. For now, we will store it on the Hugging Face Hub, using our push_to_hub method. distiset.push_to_hub(\"[your-owner-name]/example-retrieval-reranking-dataset\")\n We have got 2 different leaf/end nodes, therefore we've got a distil configurations we can access, one for the retrieval data, and one for the reranking data. Looking at these initial examples, we can see they nicely capture the essence of the chunks column but we will need to evaluate the quality of the data a bit more before we can use it for fine-tuning. model_id = \"Snowflake/snowflake-arctic-embed-m\" # Hugging Face model ID\n\nmodel_retrieval = SentenceTransformer(\n model_id, device=\"cuda\" if torch.cuda.is_available() else \"cpu\"\n)\n Next, we will encode the generated text pairs and compute the similarities. 
from sklearn.metrics.pairwise import cosine_similarity\n\ndef get_embeddings(texts):\n vectors = model_retrieval.encode(texts)\n return [vector.tolist() for vector in vectors]\n\n\ndef get_similarities(vector_batch_a, vector_batch_b):\n similarities = []\n for vector_a, vector_b in zip(vector_batch_a, vector_batch_b):\n similarity = cosine_similarity([vector_a], [vector_b])[0][0]\n similarities.append(similarity)\n return similarities\n\ndef format_data_retriever(batch):# -> Any:\n batch[\"anchor-vector\"] = get_embeddings(batch[\"anchor\"])\n batch[\"positive-vector\"] = get_embeddings(batch[\"positive\"])\n batch[\"negative-vector\"] = get_embeddings(batch[\"negative\"]) \n batch[\"similarity-positive-negative\"] = get_similarities(batch[\"positive-vector\"], batch[\"negative-vector\"])\n batch[\"similarity-anchor-positive\"] = get_similarities(batch[\"anchor-vector\"], batch[\"positive-vector\"])\n batch[\"similarity-anchor-negative\"] = get_similarities(batch[\"anchor-vector\"], batch[\"negative-vector\"])\n return batch\n\ndataset_generate_retrieval_pairs = distiset[\"generate_retrieval_pairs\"][\"train\"].map(format_data_retriever, batched=True, batch_size=250)\n model_id = \"sentence-transformers/all-MiniLM-L12-v2\"\n\nmodel = CrossEncoder(model_id)\n Next, we will compute the similarity for the generated text pairs using the reranker. On top of that, we will compute an anchor-vector to allow for doing semantic search. def format_data_reranker(batch):# -> Any:\n # The CrossEncoder scores raw text pairs, so we pass the texts rather than the embedding vectors.\n batch[\"anchor-vector\"] = get_embeddings(batch[\"anchor\"])\n batch[\"similarity-positive-negative\"] = model.predict(list(zip(batch[\"positive\"], batch[\"negative\"])))\n batch[\"similarity-anchor-positive\"] = model.predict(list(zip(batch[\"anchor\"], batch[\"positive\"])))\n batch[\"similarity-anchor-negative\"] = model.predict(list(zip(batch[\"anchor\"], batch[\"negative\"])))\n return batch\n\ndataset_generate_reranking_pairs = distiset[\"generate_reranking_pairs\"][\"train\"].map(format_data_reranker, batched=True, batch_size=250)\n And voila, we have our proxies for quality evaluation which we can use to filter out the best and worst examples. First, we need to define the settings for our Argilla dataset. We will create two different datasets, one for the retrieval data and one for the reranking data to ensure our annotators can focus on the task at hand. 
import argilla as rg\nfrom argilla._exceptions import ConflictError\n\napi_key = \"ohh so secret\"\napi_url = \"https://[your-owner-name]-[your-space-name].hf.space\"\n\nclient = rg.Argilla(api_url=api_url, api_key=api_key)\n\nsettings = rg.Settings(\n fields=[\n rg.TextField(\"anchor\")\n ],\n questions=[\n rg.TextQuestion(\"positive\"),\n rg.TextQuestion(\"negative\"),\n rg.LabelQuestion(\n name=\"is_positive_relevant\",\n title=\"Is the positive query relevant?\",\n labels=[\"yes\", \"no\"],\n ),\n rg.LabelQuestion(\n name=\"is_negative_irrelevant\",\n title=\"Is the negative query irrelevant?\",\n labels=[\"yes\", \"no\"],\n )\n ],\n metadata=[\n rg.TermsMetadataProperty(\"filename\"),\n rg.FloatMetadataProperty(\"similarity-positive-negative\"),\n rg.FloatMetadataProperty(\"similarity-anchor-positive\"),\n rg.FloatMetadataProperty(\"similarity-anchor-negative\"),\n ],\n vectors=[\n rg.VectorField(\"anchor-vector\", dimensions=model.get_sentence_embedding_dimension())\n ]\n)\nrg_datasets = []\nfor dataset_name in [\"generate_retrieval_pairs\", \"generate_reranking_pairs\"]:\n ds = rg.Dataset(\n name=dataset_name,\n settings=settings\n )\n try:\n ds.create()\n except ConflictError:\n ds = client.datasets(dataset_name)\n rg_datasets.append(ds)\n Now, we've got our dataset definitions setup in Argilla, we can upload our data to Argilla. ds_datasets = [dataset_generate_retrieval_pairs, dataset_generate_reranking_pairs]\n\nrecords = []\n\nfor rg_dataset, ds_dataset in zip(rg_datasets, ds_datasets):\n for idx, entry in enumerate(ds_dataset):\n records.append(\n rg.Record(\n id=idx,\n fields={\"anchor\": entry[\"anchor\"]},\n suggestions=[\n rg.Suggestion(\"positive\", value=entry[\"positive\"], agent=\"gpt-4o\", type=\"model\"),\n rg.Suggestion(\"negative\", value=entry[\"negative\"], agent=\"gpt-4o\", type=\"model\"),\n ],\n metadata={\n \"filename\": entry[\"filename\"],\n \"similarity-positive-negative\": entry[\"similarity-positive-negative\"],\n \"similarity-anchor-positive\": entry[\"similarity-anchor-positive\"],\n \"similarity-anchor-negative\": entry[\"similarity-anchor-negative\"]\n },\n vectors={\"anchor-vector\": entry[\"anchor-vector\"]}\n )\n )\n rg_dataset.records.log(records)\n Now, we can explore the UI and add a final human touch to get he most out of our dataset. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#synthetic-data-generation-for-fine-tuning-custom-retrieval-and-reranking-models","title":"Synthetic data generation for fine-tuning custom retrieval and reranking models","text":" - Goal: Bootstrap, optimize and maintain your embedding models and rerankers through synthetic data generation and human feedback.
- Libraries: argilla, hf-inference-endpoints, sentence-transformers
- Components: LoadDataFromHub, GenerateSentencePair, InferenceEndpointsLLM
Note For a comprehensive overview on optimizing the retrieval performance in a RAG pipeline, check this guide in collaboration with ZenML, an open-source MLOps framework designed for building portable and production-ready machine learning pipelines. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#install-the-dependencies","title":"Install the dependencies","text":"To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command: "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide. Along with that, you will need to install Argilla as a distilabel extra. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#the-dataset","title":"The dataset","text":"Before starting any project, it is always important to look at your data. Our data is publicly available on the Hugging Face Hub so we can have a quick look through their dataset viewer within an embedded iFrame. As we can see, our dataset contains a column called chunks , which was obtained from the Argilla docs. Normally, you would need to download and chunk the data but we will not cover that in this tutorial. To read a full explanation for how this dataset was generated, please refer to How we leveraged distilabel to create an Argilla 2.0 Chatbot. Alternatively, we can load the entire dataset to disk with datasets.load_dataset . "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#synthetic-data-generation","title":"Synthetic data generation","text":"The GenerateSentencePair component from distilabel can be used to generate training datasets for embeddings models. It is a pre-defined Task that, given an anchor sentence, generates data for a specific action . Supported actions are: \"paraphrase\", \"semantically-similar\", \"query\", \"answer\" . In our case the chunks column corresponds to the anchor . This means we will use query to generate potential queries for fine-tuning a retrieval model and that we will use semantically-similar to generate texts that are similar to the initial anchor for fine-tuning a reranking model. We will set triplet=True in order to generate both positive and negative examples, which should help the model generalize better during fine-tuning, and we will set hard_negative=True to generate more challenging examples that are closer to the anchor and discussed topics. Lastly, we can seed the LLM with context to generate more relevant examples. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval","title":"Retrieval","text":"For retrieval, we will thus generate queries that are similar to the chunks column. We will use the query action to generate potential queries for fine-tuning a retrieval model. 
generate_sentence_pair = GenerateSentencePair(\n triplet=True, \n hard_negative=True,\n action=\"query\",\n llm=llm,\n input_batch_size=10,\n context=context,\n)\n "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking","title":"Reranking","text":"For reranking, we will generate texts that are similar to the initial anchor. We will use the semantically-similar action to generate texts that are similar to the initial anchor for fine-tuning a reranking model. In this case, we set hard_negative=False to generate more diverse and potentially wrong examples, which can be used as negative examples for similarity fine-tuning because rerankers cannot be fine-tuned using triplets. generate_sentence_pair = GenerateSentencePair(\n triplet=True,\n hard_negative=False,\n action=\"semantically-similar\",\n llm=llm,\n input_batch_size=10,\n context=context,\n)\n "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#combined-pipeline","title":"Combined pipeline","text":"We will now use the GenerateSentencePair task to generate synthetic data for both retrieval and reranking models in a single pipeline. Note that we map the chunks column to the anchor argument. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#data-quality-evaluation","title":"Data quality evaluation","text":"Data is never as clean as it could be, and this also holds for synthetically generated data; therefore, it is always good to spend some time looking at your data. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#feature-engineering","title":"Feature engineering","text":"In order to evaluate the quality of our data we will use features of the models that we intend to fine-tune as a proxy for data quality. We can then use these features to filter out the best examples. In order to choose a good default model, we will use the Massive Text Embedding Benchmark (MTEB) Leaderboard. We want to optimize for size and speed, so we will set model size <100M and then filter for Retrieval and Reranking based on the highest average score, resulting in Snowflake/snowflake-arctic-embed-s and sentence-transformers/all-MiniLM-L12-v2 respectively. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval_1","title":"Retrieval","text":"For retrieval, we will compute similarities for the current embeddings of anchor-positive , positive-negative and anchor-negative pairs. We assume that an overlap of these similarities will cause the model to have difficulties generalizing and therefore we can use these features to evaluate the quality of our data. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking_1","title":"Reranking","text":"For reranking, we will compute the relevance scores from an existing reranker model for anchor-positive , positive-negative and anchor-negative pairs and make a similar assumption as for the retrieval model. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#optional-argilla","title":"(Optional) Argilla","text":"To get the most out of your data and actually look at it, we will use Argilla. If you are not familiar with Argilla, we recommend taking a look at the Argilla quickstart docs. Alternatively, you can use your Hugging Face account to login to the Argilla demo Space. To start exploring data, we first need to define an argilla.Dataset . 
We will create a basic dataset with some input TextFields for the anchor and output TextQuestions for the positive and negative pairs. Additionally, we will use the file_name as MetaDataProperty . Lastly, we will be re-using the vectors obtained from our previous step to allow for semantic search, and we will add the similarity scores for some basic filtering and sorting. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#fine-tuning","title":"Fine-tuning","text":"At last, we can fine-tune our models. We will use the sentence-transformers library to fine-tune our models. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval_2","title":"Retrieval","text":"For retrieval, we have created a script that fine-tunes a model on the generated data, based on https://github.com/argilla-io/argilla-sdk-chatbot/blob/main/train_embedding.ipynb. You can also open it in Google Colab directly. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking_2","title":"Reranking","text":"For reranking, sentence-transformers provides a script that shows how to fine-tune a CrossEncoder model. As of now, there is some uncertainty over fine-tuning CrossEncoder models with triplets, but you can still use the positive and anchor pairs. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#conclusions","title":"Conclusions","text":"In this tutorial, we present an end-to-end example of fine-tuning retrievers and rerankers for RAG. This serves as a good starting point for optimizing and maintaining your data and model, but needs to be adapted to your specific use case. We started with some seed data from the Argilla docs, generated synthetic data for retrieval and reranking models, evaluated the quality of the data, and showed how to fine-tune the models. We also used Argilla to get a human touch on the data. "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/","title":"Clean an existing preference dataset","text":" - Goal: Clean an existing preference dataset by providing AI feedback on the quality of the data.
- Libraries: argilla, hf-inference-endpoints
- Components: LoadDataFromDicts, UltraFeedback, KeepColumns, PreferenceToArgilla, InferenceEndpointsLLM, GlobalStep
!pip install \"distilabel[hf-inference-endpoints]\"\n !pip install \"transformers~=4.0\" \"torch~=2.0\"\n Let's make the required imports: import random\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import (\n KeepColumns,\n LoadDataFromDicts,\n PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import UltraFeedback\n You'll need an HF_TOKEN to use the HF Inference Endpoints. Login to use it directly within this notebook. import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n !pip install \"distilabel[argilla, hf-inference-endpoints]\"\n In this case, we will clean a preference dataset, so we will use the Intel/orca_dpo_pairs dataset from the Hugging Face Hub. dataset = load_dataset(\"Intel/orca_dpo_pairs\", split=\"train[:20]\")\n Next, we will shuffle the chosen and rejected columns to avoid any bias in the dataset. def shuffle_and_track(chosen, rejected):\n pair = [chosen, rejected]\n random.shuffle(pair)\n order = [\"chosen\" if x == chosen else \"rejected\" for x in pair]\n return {\"generations\": pair, \"order\": order}\n\ndataset = dataset.map(lambda x: shuffle_and_track(x[\"chosen\"], x[\"rejected\"]))\n dataset = dataset.to_list()\n As a custom step You can also create a custom step in a separate module, import it and add it to the pipeline after loading the orca_dpo_pairs dataset using the LoadDataFromHub step. shuffle_step.pyfrom typing import TYPE_CHECKING, List\nfrom distilabel.steps import GlobalStep, StepInput\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepOutput\n\nimport random\n\nclass ShuffleStep(GlobalStep):\n @property\n def inputs(self):\n \"\"\"Returns List[str]: The inputs of the step.\"\"\"\n return [\"instruction\", \"chosen\", \"rejected\"]\n\n @property\n def outputs(self):\n \"\"\"Returns List[str]: The outputs of the step.\"\"\"\n return [\"instruction\", \"generations\", \"order\"]\n\n def process(self, inputs: StepInput):\n \"\"\"Returns StepOutput: The outputs of the step.\"\"\"\n outputs = []\n\n for input in inputs:\n chosen = input[\"chosen\"]\n rejected = input[\"rejected\"]\n pair = [chosen, rejected]\n random.shuffle(pair)\n order = [\"chosen\" if x == chosen else \"rejected\" for x in pair]\n\n outputs.append({\"instruction\": input[\"instruction\"], \"generations\": pair, \"order\": order})\n\n yield outputs\n from shuffle_step import ShuffleStep\n To clean an existing preference dataset, we will need to define a Pipeline with all the necessary steps. However, a similar workflow can be used to clean a SFT dataset. Below, we will go over each step in detail. load_dataset = LoadDataFromDicts(\n data=dataset[:1],\n output_mappings={\"question\": \"instruction\"},\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nload_dataset.load()\nnext(load_dataset.process())\n \n([{'system': '',\n 'question': \"You will be given a definition of a task first, then some input of the task.\\nThis task is about using the specified sentence and converting the sentence to Resource Description Framework (RDF) triplets of the form (subject, predicate object). The RDF triplets generated must be such that the triplets accurately capture the structure and semantics of the input sentence. The input is a sentence and the output is a list of triplets of the form [subject, predicate, object] that capture the relationships present in the sentence. 
When a sentence has more than 1 RDF triplet possible, the output must contain all of them.\\n\\nAFC Ajax (amateurs)'s ground is Sportpark De Toekomst where Ajax Youth Academy also play.\\nOutput:\",\n 'chosen': '[\\n [\"AFC Ajax (amateurs)\", \"has ground\", \"Sportpark De Toekomst\"],\\n [\"Ajax Youth Academy\", \"plays at\", \"Sportpark De Toekomst\"]\\n]',\n 'rejected': \" Sure, I'd be happy to help! Here are the RDF triplets for the input sentence:\\n\\n[AFC Ajax (amateurs), hasGround, Sportpark De Toekomst]\\n[Ajax Youth Academy, playsAt, Sportpark De Toekomst]\\n\\nExplanation:\\n\\n* AFC Ajax (amateurs) is the subject of the first triplet, and hasGround is the predicate that describes the relationship between AFC Ajax (amateurs) and Sportpark De Toekomst.\\n* Ajax Youth Academy is the subject of the second triplet, and playsAt is the predicate that describes the relationship between Ajax Youth Academy and Sportpark De Toekomst.\\n\\nNote that there may be other possible RDF triplets that could be derived from the input sentence, but the above triplets capture the main relationships present in the sentence.\",\n 'generations': [\" Sure, I'd be happy to help! Here are the RDF triplets for the input sentence:\\n\\n[AFC Ajax (amateurs), hasGround, Sportpark De Toekomst]\\n[Ajax Youth Academy, playsAt, Sportpark De Toekomst]\\n\\nExplanation:\\n\\n* AFC Ajax (amateurs) is the subject of the first triplet, and hasGround is the predicate that describes the relationship between AFC Ajax (amateurs) and Sportpark De Toekomst.\\n* Ajax Youth Academy is the subject of the second triplet, and playsAt is the predicate that describes the relationship between Ajax Youth Academy and Sportpark De Toekomst.\\n\\nNote that there may be other possible RDF triplets that could be derived from the input sentence, but the above triplets capture the main relationships present in the sentence.\",\n '[\\n [\"AFC Ajax (amateurs)\", \"has ground\", \"Sportpark De Toekomst\"],\\n [\"Ajax Youth Academy\", \"plays at\", \"Sportpark De Toekomst\"]\\n]'],\n 'order': ['rejected', 'chosen']}],\n True) \n evaluate_responses = UltraFeedback(\n aspect=\"overall-rating\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nevaluate_responses.load()\nnext(\n evaluate_responses.process(\n [\n {\n \"instruction\": \"What's the capital of Spain?\",\n \"generations\": [\"Madrid\", \"Barcelona\"],\n }\n ]\n )\n)\n \n[{'instruction': \"What's the capital of Spain?\",\n 'generations': ['Madrid', 'Barcelona'],\n 'ratings': [5, 1],\n 'rationales': [\"The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\",\n \"The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"],\n 'distilabel_metadata': {'raw_output_ultra_feedback_0': \"#### Output for Text 1\\nRating: 5 (Excellent)\\nRationale: The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. 
It confidently provides the accurate information, aligning perfectly with the user's intent.\\n\\n#### Output for Text 2\\nRating: 1 (Low Quality)\\nRationale: The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"},\n 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] \n keep_columns = KeepColumns(\n columns=[\n \"instruction\",\n \"generations\",\n \"order\",\n \"ratings\",\n \"rationales\",\n \"model_name\",\n ],\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nkeep_columns.load()\nnext(\n keep_columns.process(\n [\n {\n \"system\": \"\",\n \"instruction\": \"What's the capital of Spain?\",\n \"chosen\": \"Madrid\",\n \"rejected\": \"Barcelona\",\n \"generations\": [\"Madrid\", \"Barcelona\"],\n \"order\": [\"chosen\", \"rejected\"],\n \"ratings\": [5, 1],\n \"rationales\": [\"\", \"\"],\n \"model_name\": \"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n }\n ]\n )\n)\n \n[{'instruction': \"What's the capital of Spain?\",\n 'generations': ['Madrid', 'Barcelona'],\n 'order': ['chosen', 'rejected'],\n 'ratings': [5, 1],\n 'rationales': ['', ''],\n 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] \n to_argilla = PreferenceToArgilla(\n dataset_name=\"cleaned-dataset\",\n dataset_workspace=\"argilla\",\n api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n api_key=\"[your-api-key]\",\n num_generations=2\n)\n Below, you can see the full pipeline definition: with Pipeline(name=\"clean-dataset\") as pipeline:\n\n load_dataset = LoadDataFromDicts(\n data=dataset, output_mappings={\"question\": \"instruction\"}\n )\n\n evaluate_responses = UltraFeedback(\n aspect=\"overall-rating\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n )\n\n keep_columns = KeepColumns(\n columns=[\n \"instruction\",\n \"generations\",\n \"order\",\n \"ratings\",\n \"rationales\",\n \"model_name\",\n ]\n )\n\n to_argilla = PreferenceToArgilla(\n dataset_name=\"cleaned-dataset\",\n dataset_workspace=\"argilla\",\n api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n api_key=\"[your-api-key]\",\n num_generations=2,\n )\n\n load_dataset.connect(evaluate_responses)\n evaluate_responses.connect(keep_columns)\n keep_columns.connect(to_argilla)\n Let's now run the pipeline and clean our preference dataset. distiset = pipeline.run()\n Let's check it! If you have loaded the data to Argilla, you can start annotating in the Argilla UI. You can push the dataset to the Hub for sharing with the community and embed it to explore the data. distiset.push_to_hub(\"[your-owner-name]/example-cleaned-preference-dataset\")\n In this tutorial, we showcased the detailed steps to build a pipeline for cleaning a preference dataset using distilabel. However, you can customize this pipeline for your own use cases, such as cleaning an SFT dataset or adding custom steps. We used a preference dataset as our starting point and shuffled the data to avoid any bias. Next, we evaluated the responses using a model through the serverless Hugging Face Inference API, following the UltraFeedback standards. Finally, we kept the needed columns and used Argilla for further curation. 
"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#clean-an-existing-preference-dataset","title":"Clean an existing preference dataset","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#getting-started","title":"Getting Started","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#install-the-dependencies","title":"Install the dependencies","text":"To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command: "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide. Along with that, you will need to install Argilla as a distilabel extra. "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#the-dataset","title":"The dataset","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#define-the-pipeline","title":"Define the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#load-the-dataset","title":"Load the dataset","text":"We will use the dataset we just shuffled as source data. - Component:
LoadDataFromDicts - Input columns:
system , question , chosen , rejected , generations and order , the same keys as in the loaded list of dictionaries. - Output columns:
system , instruction , chosen , rejected , generations and order . We will use output_mappings to rename the columns. "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#evaluate-the-responses","title":"Evaluate the responses","text":"To evaluate the quality of the responses, we will use meta-llama/Meta-Llama-3.1-70B-Instruct , applying the UltraFeedback task that judges the responses according to different dimensions (helpfulness, honesty, instruction-following, truthfulness). For an SFT dataset, you can use PrometheusEval instead. - Component:
UltraFeedback task with LLMs using InferenceEndpointsLLM - Input columns:
instruction , generations - Output columns:
ratings , rationales , distilabel_metadata , model_name To improve the results for your use case, you can use any other LLM of your choice. "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#keep-only-the-required-columns","title":"Keep only the required columns","text":"We will get rid of the unneeded columns. - Component:
KeepColumns - Input columns:
system , instruction , chosen , rejected , generations , order , ratings , rationales , distilabel_metadata and model_name - Output columns:
instruction , generations , order , ratings , rationales and model_name "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#optional-further-data-curation","title":"(Optional) Further data curation","text":"You can use Argilla to further curate your data. - Component:
PreferenceToArgilla step - Input columns:
instruction , generations , ratings , rationales - Output columns:
instruction , generations , ratings , rationales "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#conclusions","title":"Conclusions","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/","title":"Generate a preference dataset","text":" - Goal: Generate a synthetic preference dataset for DPO/ORPO.
- Libraries: argilla, hf-inference-endpoints
- Components: LoadDataFromHub, TextGeneration, UltraFeedback, GroupColumns, FormatTextGenerationDPO, PreferenceToArgilla, InferenceEndpointsLLM
!pip install \"distilabel[hf-inference-endpoints]\"\n !pip install \"transformers~=4.0\" \"torch~=2.0\"\n Let's make the required imports: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import (\n LoadDataFromHub,\n GroupColumns,\n FormatTextGenerationDPO,\n PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n You'll need an HF_TOKEN to use the HF Inference Endpoints. Log in to use it directly within this notebook. import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n !pip install \"distilabel[argilla, hf-inference-endpoints]\"\n To generate our preference dataset, we will need to define a Pipeline with all the necessary steps. Below, we will go over each step in detail. load_dataset = LoadDataFromHub(\n repo_id= \"argilla/10Kprompts-mini\",\n num_examples=1,\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n )\nload_dataset.load()\nnext(load_dataset.process())\n \n([{'instruction': 'How can I create an efficient and robust workflow that utilizes advanced automation techniques to extract targeted data, including customer information, from diverse PDF documents and effortlessly integrate it into a designated Google Sheet? Furthermore, I am interested in establishing a comprehensive and seamless system that promptly activates an SMS notification on my mobile device whenever a new PDF document is uploaded to the Google Sheet, ensuring real-time updates and enhanced accessibility.',\n 'topic': 'Software Development'}],\n True) \n generate_responses = [\n TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n ),\n TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n tokenizer_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n ),\n]\nfor task in generate_responses:\n task.load()\n print(next(task.process([{\"instruction\": \"Which are the top cities in Spain?\"}])))\n \n[{'instruction': 'Which are the top cities in Spain?', 'generation': 'Spain is a country with a rich culture, history, and architecture, and it has many great cities to visit. Here are some of the top cities in Spain:\\n\\n1. **Madrid**: The capital city of Spain, known for its vibrant nightlife, museums, and historic landmarks like the Royal Palace and Prado Museum.\\n2. **Barcelona**: The second-largest city in Spain, famous for its modernist architecture, beaches, and iconic landmarks like La Sagrada Fam\u00edlia and Park G\u00fcell, designed by Antoni Gaud\u00ed.\\n3. **Valencia**: Located on the Mediterranean coast, Valencia is known for its beautiful beaches, City of Arts and Sciences, and delicious local cuisine, such as paella.\\n4. **Seville**: The capital of Andalusia, Seville is famous for its stunning cathedral, Royal Alc\u00e1zar Palace, and lively flamenco music scene.\\n5. **M\u00e1laga**: A coastal city in southern Spain, M\u00e1laga is known for its rich history, beautiful beaches, and being the birthplace of Pablo Picasso.\\n6. 
**Zaragoza**: Located in the northeastern region of Aragon, Zaragoza is a city with a rich history, known for its Roman ruins, Gothic cathedral, and beautiful parks.\\n7. **Granada**: A city in the Andalusian region, Granada is famous for its stunning Alhambra palace and generalife gardens, a UNESCO World Heritage Site.\\n8. **Bilbao**: A city in the Basque Country, Bilbao is known for its modern architecture, including the Guggenheim Museum, and its rich cultural heritage.\\n9. **Alicante**: A coastal city in the Valencia region, Alicante is famous for its beautiful beaches, historic castle, and lively nightlife.\\n10. **San Sebasti\u00e1n**: A city in the Basque Country, San Sebasti\u00e1n is known for its stunning beaches, gastronomic scene, and cultural events like the San Sebasti\u00e1n International Film Festival.\\n\\nThese are just a few of the many great cities in Spain, each with its own unique character and attractions.', 'distilabel_metadata': {'raw_output_text_generation_0': 'Spain is a country with a rich culture, history, and architecture, and it has many great cities to visit. Here are some of the top cities in Spain:\\n\\n1. **Madrid**: The capital city of Spain, known for its vibrant nightlife, museums, and historic landmarks like the Royal Palace and Prado Museum.\\n2. **Barcelona**: The second-largest city in Spain, famous for its modernist architecture, beaches, and iconic landmarks like La Sagrada Fam\u00edlia and Park G\u00fcell, designed by Antoni Gaud\u00ed.\\n3. **Valencia**: Located on the Mediterranean coast, Valencia is known for its beautiful beaches, City of Arts and Sciences, and delicious local cuisine, such as paella.\\n4. **Seville**: The capital of Andalusia, Seville is famous for its stunning cathedral, Royal Alc\u00e1zar Palace, and lively flamenco music scene.\\n5. **M\u00e1laga**: A coastal city in southern Spain, M\u00e1laga is known for its rich history, beautiful beaches, and being the birthplace of Pablo Picasso.\\n6. **Zaragoza**: Located in the northeastern region of Aragon, Zaragoza is a city with a rich history, known for its Roman ruins, Gothic cathedral, and beautiful parks.\\n7. **Granada**: A city in the Andalusian region, Granada is famous for its stunning Alhambra palace and generalife gardens, a UNESCO World Heritage Site.\\n8. **Bilbao**: A city in the Basque Country, Bilbao is known for its modern architecture, including the Guggenheim Museum, and its rich cultural heritage.\\n9. **Alicante**: A coastal city in the Valencia region, Alicante is famous for its beautiful beaches, historic castle, and lively nightlife.\\n10. **San Sebasti\u00e1n**: A city in the Basque Country, San Sebasti\u00e1n is known for its stunning beaches, gastronomic scene, and cultural events like the San Sebasti\u00e1n International Film Festival.\\n\\nThese are just a few of the many great cities in Spain, each with its own unique character and attractions.'}, 'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct'}]\n[{'instruction': 'Which are the top cities in Spain?', 'generation': ' Here are some of the top cities in Spain based on various factors such as tourism, culture, history, and quality of life:\\n\\n1. Madrid: The capital and largest city in Spain, Madrid is known for its vibrant nightlife, world-class museums (such as the Prado Museum and Reina Sofia Museum), stunning parks (such as the Retiro Park), and delicious food.\\n\\n2. 
Barcelona: Famous for its unique architecture, Barcelona is home to several UNESCO World Heritage sites designed by Antoni Gaud\u00ed, including the Sagrada Familia and Park G\u00fcell. The city also boasts beautiful beaches, a lively arts scene, and delicious Catalan cuisine.\\n\\n3. Valencia: A coastal city located in the east of Spain, Valencia is known for its City of Arts and Sciences, a modern architectural complex that includes a planetarium, opera house, and museum of interactive science. The city is also famous for its paella, a traditional Spanish dish made with rice, vegetables, and seafood.\\n\\n4. Seville: The capital of Andalusia, Seville is famous for its flamenco dancing, stunning cathedral (the largest Gothic cathedral in the world), and the Alc\u00e1zar, a beautiful palace made up of a series of rooms and courtyards.\\n\\n5. Granada: Located in the foothills of the Sierra Nevada mountains, Granada is known for its stunning Alhambra palace, a Moorish fortress that dates back to the 9th century. The city is also famous for its tapas, a traditional Spanish dish that is often served for free with drinks.\\n\\n6. Bilbao: A city in the Basque Country, Bilbao is famous for its modern architecture, including the Guggenheim Museum, a contemporary art museum designed by Frank Gehry. The city is also known for its pintxos, a type of Basque tapas that are served in bars and restaurants.\\n\\n7. M\u00e1laga: A coastal city in Andalusia, M\u00e1laga is known for its beautiful beaches, historic sites (including the Alcazaba and Gibralfaro castles), and the Picasso Museum, which is dedicated to the famous Spanish artist who was born in the city.\\n\\nThese are just a few of the many wonderful cities in Spain.', 'distilabel_metadata': {'raw_output_text_generation_0': ' Here are some of the top cities in Spain based on various factors such as tourism, culture, history, and quality of life:\\n\\n1. Madrid: The capital and largest city in Spain, Madrid is known for its vibrant nightlife, world-class museums (such as the Prado Museum and Reina Sofia Museum), stunning parks (such as the Retiro Park), and delicious food.\\n\\n2. Barcelona: Famous for its unique architecture, Barcelona is home to several UNESCO World Heritage sites designed by Antoni Gaud\u00ed, including the Sagrada Familia and Park G\u00fcell. The city also boasts beautiful beaches, a lively arts scene, and delicious Catalan cuisine.\\n\\n3. Valencia: A coastal city located in the east of Spain, Valencia is known for its City of Arts and Sciences, a modern architectural complex that includes a planetarium, opera house, and museum of interactive science. The city is also famous for its paella, a traditional Spanish dish made with rice, vegetables, and seafood.\\n\\n4. Seville: The capital of Andalusia, Seville is famous for its flamenco dancing, stunning cathedral (the largest Gothic cathedral in the world), and the Alc\u00e1zar, a beautiful palace made up of a series of rooms and courtyards.\\n\\n5. Granada: Located in the foothills of the Sierra Nevada mountains, Granada is known for its stunning Alhambra palace, a Moorish fortress that dates back to the 9th century. The city is also famous for its tapas, a traditional Spanish dish that is often served for free with drinks.\\n\\n6. Bilbao: A city in the Basque Country, Bilbao is famous for its modern architecture, including the Guggenheim Museum, a contemporary art museum designed by Frank Gehry. 
The city is also known for its pintxos, a type of Basque tapas that are served in bars and restaurants.\\n\\n7. M\u00e1laga: A coastal city in Andalusia, M\u00e1laga is known for its beautiful beaches, historic sites (including the Alcazaba and Gibralfaro castles), and the Picasso Museum, which is dedicated to the famous Spanish artist who was born in the city.\\n\\nThese are just a few of the many wonderful cities in Spain.'}, 'model_name': 'mistralai/Mixtral-8x7B-Instruct-v0.1'}]\n \n group_responses = GroupColumns(\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nnext(\n group_responses.process(\n [\n {\n \"generation\": \"Madrid\",\n \"model_name\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n },\n ],\n [\n {\n \"generation\": \"Barcelona\",\n \"model_name\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n }\n ],\n )\n)\n \n[{'generations': ['Madrid', 'Barcelona'],\n 'model_names': ['meta-llama/Meta-Llama-3-8B-Instruct',\n 'mistralai/Mixtral-8x7B-Instruct-v0.1']}] \n evaluate_responses = UltraFeedback(\n aspect=\"overall-rating\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nevaluate_responses.load()\nnext(\n evaluate_responses.process(\n [\n {\n \"instruction\": \"What's the capital of Spain?\",\n \"generations\": [\"Madrid\", \"Barcelona\"],\n }\n ]\n )\n)\n \n[{'instruction': \"What's the capital of Spain?\",\n 'generations': ['Madrid', 'Barcelona'],\n 'ratings': [5, 1],\n 'rationales': [\"The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\",\n \"The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"],\n 'distilabel_metadata': {'raw_output_ultra_feedback_0': \"#### Output for Text 1\\nRating: 5 (Excellent)\\nRationale: The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\\n\\n#### Output for Text 2\\nRating: 1 (Low Quality)\\nRationale: The answer is incorrect as Barcelona is not the capital of Spain. 
This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"},\n 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'}] \n format_dpo = FormatTextGenerationDPO(pipeline=Pipeline(name=\"showcase-pipeline\"))\nformat_dpo.load()\nnext(\n format_dpo.process(\n [\n {\n \"instruction\": \"What's the capital of Spain?\",\n \"generations\": [\"Madrid\", \"Barcelona\"],\n \"generation_models\": [\n \"Meta-Llama-3-8B-Instruct\",\n \"Mixtral-8x7B-Instruct-v0.1\",\n ],\n \"ratings\": [5, 1],\n }\n ]\n )\n)\n \n[{'instruction': \"What's the capital of Spain?\",\n 'generations': ['Madrid', 'Barcelona'],\n 'generation_models': ['Meta-Llama-3-8B-Instruct',\n 'Mixtral-8x7B-Instruct-v0.1'],\n 'ratings': [5, 1],\n 'prompt': \"What's the capital of Spain?\",\n 'prompt_id': '26174c953df26b3049484e4721102dca6b25d2de9e3aa22aa84f25ed1c798512',\n 'chosen': [{'role': 'user', 'content': \"What's the capital of Spain?\"},\n {'role': 'assistant', 'content': 'Madrid'}],\n 'chosen_model': 'Meta-Llama-3-8B-Instruct',\n 'chosen_rating': 5,\n 'rejected': [{'role': 'user', 'content': \"What's the capital of Spain?\"},\n {'role': 'assistant', 'content': 'Barcelona'}],\n 'rejected_model': 'Mixtral-8x7B-Instruct-v0.1',\n 'rejected_rating': 1}] \n - Or you can use Argilla to manually label the data and convert it to a preference dataset.
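Optionally, you can also filter the formatted pairs by the rating margin between the chosen and rejected responses before training. The snippet below is only a minimal sketch over the chosen_rating and rejected_rating columns shown above; the margin value and the variable names are arbitrary placeholders, not part of the pipeline itself: margin = 2\ndpo_rows = next(format_dpo.process(...)) # e.g. the list of dicts shown above\nhigh_margin_rows = [\n row\n for row in dpo_rows\n if row[\"chosen_rating\"] - row[\"rejected_rating\"] >= margin\n]\n This keeps only the pairs where the preference signal is clear, which can be useful when the two generations received very similar ratings.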
- Component:
PreferenceToArgilla step - Input columns:
instruction , generations , generation_models , ratings - Output columns:
instruction , generations , generation_models , ratings to_argilla = PreferenceToArgilla(\n dataset_name=\"preference-dataset\",\n dataset_workspace=\"argilla\",\n api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n api_key=\"[your-api-key]\",\n num_generations=2\n)\n Below, you can see the full pipeline definition: with Pipeline(name=\"generate-dataset\") as pipeline:\n\n load_dataset = LoadDataFromHub(repo_id=\"argilla/10Kprompts-mini\")\n\n generate_responses = [\n TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n )\n ),\n TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n tokenizer_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n )\n ),\n ]\n\n group_responses = GroupColumns(\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n evaluate_responses = UltraFeedback(\n aspect=\"overall-rating\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n )\n )\n\n format_dpo = FormatTextGenerationDPO()\n\n to_argilla = PreferenceToArgilla(\n dataset_name=\"preference-dataset\",\n dataset_workspace=\"argilla\",\n api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n api_key=\"[your-api-key]\",\n num_generations=2\n )\n\n for task in generate_responses:\n load_dataset.connect(task)\n task.connect(group_responses)\n group_responses.connect(evaluate_responses)\n evaluate_responses.connect(format_dpo, to_argilla)\n Let's now run the pipeline and generate the preference dataset. distiset = pipeline.run()\n Let's check the preference dataset! If you have loaded the data to Argilla, you can start annotating in the Argilla UI. You can push the dataset to the Hub for sharing with the community and embed it to explore the data. distiset.push_to_hub(\"[your-owner-name]/example-preference-dataset\")\n In this tutorial, we showcased the detailed steps to build a pipeline for generating a preference dataset using distilabel. You can customize this pipeline for your own use cases and share your datasets with the community through the Hugging Face Hub, or use them to train a model for DPO or ORPO. We used a dataset containing prompts to generate responses using two different models through the serverless Hugging Face Inference API. Next, we evaluated the responses using a third model, following the UltraFeedback standards. Finally, we converted the data to a preference dataset and used Argilla for further curation. "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#generate-a-preference-dataset","title":"Generate a preference dataset","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#install-the-dependencies","title":"Install the dependencies","text":"To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. 
We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command: "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide. Along with that, you will need to install Argilla as a distilabel extra. "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#define-the-pipeline","title":"Define the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#load-the-dataset","title":"Load the dataset","text":"We will use as source data the argilla/10Kprompts-mini dataset from the Hugging Face Hub. - Component:
LoadDataFromHub - Input columns:
instruction and topic , the same as in the loaded dataset - Output columns:
instruction and topic "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#generate-responses","title":"Generate responses","text":"We need to generate the responses for the given instructions. We will use two different models available on the Hugging Face Hub through the Serverless Inference API: meta-llama/Meta-Llama-3-8B-Instruct and mistralai/Mixtral-8x7B-Instruct-v0.1 . We will also indicate the generation parameters for each model. - Component:
TextGeneration task with LLMs using InferenceEndpointsLLM - Input columns:
instruction - Output columns:
generation , distilabel_metadata , model_name for each model To improve the results for your use case, you can use any other LLM of your choice. "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#group-the-responses","title":"Group the responses","text":"The task that evaluates the responses needs a list of generations as input. However, each model response was saved in the generation column of the subsets text_generation_0 and text_generation_1 . We will combine these two columns into a single column in the default subset. - Component:
GroupColumns - Input columns:
generation and model_name from text_generation_0 and text_generation_1 - Output columns:
generations and model_names "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#evaluate-the-responses","title":"Evaluate the responses","text":"To build our preference dataset, we need to evaluate the responses generated by the models. We will use meta-llama/Meta-Llama-3-70B-Instruct for this, applying the UltraFeedback task that judges the responses according to different dimensions (helpfulness, honesty, instruction-following, truthfulness). - Component:
UltraFeedback task with LLMs using InferenceEndpointsLLM - Input columns:
instruction , generations - Output columns:
ratings , rationales , distilabel_metadata , model_name To improve the results for your use case, you can use any other LLM of your choice. "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#convert-to-a-preference-dataset","title":"Convert to a preference dataset","text":" - You can automatically convert the evaluated responses to a preference dataset with the
chosen and rejected columns. - Component:
FormatTextGenerationDPO step - Input columns:
instruction , generations , generation_models , ratings - Output columns:
prompt , prompt_id , chosen , chosen_model , chosen_rating , rejected , rejected_model , rejected_rating "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#conclusions","title":"Conclusions","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/","title":"Generate synthetic text classification data","text":" - Goal: Generate synthetic text classification data to augment an imbalanced and limited dataset for training a topic classifier. In addition, generate new data for training a fact-based versus opinion-based classifier to add a new label.
- Libraries: argilla, hf-inference-endpoints, SetFit
- Components: LoadDataFromDicts, EmbeddingTaskGenerator, GenerateTextClassificationData
!pip install \"distilabel[hf-inference-endpoints]\"\n !pip install \"transformers~=4.40\" \"torch~=2.0\" \"setfit~=1.0\"\n Let's make the required imports: import random\nfrom collections import Counter\n\nfrom datasets import load_dataset, Dataset\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import (\n GenerateTextClassificationData,\n)\nfrom setfit import SetFitModel, Trainer, sample_dataset\n You'll need an HF_TOKEN to use the HF Inference Endpoints. Log in to use it directly within this notebook. import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n !pip install \"distilabel[argilla, hf-inference-endpoints]\"\n We will use the fancyzhx/ag_news dataset from the Hugging Face Hub as our original data source. To simulate a real-world scenario with imbalanced and limited data, we will load only 20 samples from this dataset. hf_dataset = load_dataset(\"fancyzhx/ag_news\", split=\"train[-20:]\")\n Now, we can retrieve the available labels in the dataset and examine the current data distribution. labels_topic = hf_dataset.features[\"label\"].names\nid2str = {i: labels_topic[i] for i in range(len(labels_topic))}\nprint(id2str)\nprint(Counter(hf_dataset[\"label\"]))\n \n{0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}\nCounter({0: 12, 1: 6, 2: 2})\n \n As observed, the dataset is imbalanced, with most samples falling under the World category, while the Sci/Tech category is entirely missing. Moreover, there are insufficient samples to effectively train a topic classification model. We will also define the labels for the new classification task. labels_fact_opinion = [\"Fact-based\", \"Opinion-based\"]\n To generate the data we will use the GenerateTextClassificationData task. This task will use as input classification tasks and we can define the language, difficulty and clarity required for the generated data. task = GenerateTextClassificationData(\n language=\"English\",\n difficulty=\"college\",\n clarity=\"clear\",\n num_generations=1,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.4},\n ),\n input_batch_size=5,\n)\ntask.load()\nresult = next(\n task.process([{\"task\": \"Classify the news article as fact-based or opinion-based\"}])\n)\nprint(result[0][\"distilabel_metadata\"][\"raw_input_generate_text_classification_data_0\"])\n \n[{'role': 'user', 'content': 'You have been assigned a text classification task: Classify the news article as fact-based or opinion-based\\n\\nYour mission is to write one text classification example for this task in JSON format. 
The JSON object must contain the following keys:\\n - \"input_text\": a string, the input text specified by the classification task.\\n - \"label\": a string, the correct label of the input text.\\n - \"misleading_label\": a string, an incorrect label that is related to the task.\\n\\nPlease adhere to the following guidelines:\\n - The \"input_text\" should be diverse in expression.\\n - The \"misleading_label\" must be a valid label for the given task, but not as appropriate as the \"label\" for the \"input_text\".\\n - The values for all fields should be in English.\\n - Avoid including the values of the \"label\" and \"misleading_label\" fields in the \"input_text\", that would make the task too easy.\\n - The \"input_text\" is clear and requires college level education to comprehend.\\n\\nYour output must always be a JSON object only, do not explain yourself or output anything else. Be creative!'}]\n \n For our use case, we only need to generate data for two tasks: a topic classification task and a fact versus opinion classification task. Therefore, we will define the tasks accordingly. As we will be using an smaller model for generation, we will select 2 random labels for each topic classification task and change the order for the fact versus opinion classification task ensuring more diversity in the generated data. task_templates = [\n \"Determine the news article as {}\",\n \"Classify news article as {}\",\n \"Identify the news article as {}\",\n \"Categorize the news article as {}\",\n \"Label the news article using {}\",\n \"Annotate the news article based on {}\",\n \"Determine the theme of a news article from {}\",\n \"Recognize the topic of the news article as {}\",\n]\n\nclassification_tasks = [\n {\"task\": action.format(\" or \".join(random.sample(labels_topic, 2)))}\n for action in task_templates for _ in range(4)\n] + [\n {\"task\": action.format(\" or \".join(random.sample(labels_fact_opinion, 2)))}\n for action in task_templates\n]\n Now, it's time to define and run the pipeline. As mentioned, we will load the written tasks and feed them into the GenerateTextClassificationData task. For our use case, we will be using Meta-Llama-3.1-8B-Instruct via the InferenceEndpointsLLM , with different degrees of difficulty and clarity. difficulties = [\"college\", \"high school\", \"PhD\"]\nclarity = [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n\nwith Pipeline(\"texcat-generation-pipeline\") as pipeline:\n\n tasks_generator = LoadDataFromDicts(data=classification_tasks)\n\n generate_data = []\n for difficulty in difficulties:\n for clarity_level in clarity:\n task = GenerateTextClassificationData(\n language=\"English\",\n difficulty=difficulty,\n clarity=clarity_level,\n num_generations=2,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n input_batch_size=5,\n )\n generate_data.append(task)\n\n for task in generate_data:\n tasks_generator.connect(task)\n Let's now run the pipeline and generate the synthetic data. 
distiset = pipeline.run()\n distiset[\"generate_text_classification_data_0\"][\"train\"][0]\n \n{'task': 'Determine the news article as Business or World',\n 'input_text': \"The recent decision by the European Central Bank to raise interest rates will likely have a significant impact on the eurozone's economic growth, with some analysts predicting a 0.5% contraction in GDP due to the increased borrowing costs. The move is seen as a measure to combat inflation, which has been rising steadily over the past year.\",\n 'label': 'Business',\n 'misleading_label': 'World',\n 'distilabel_metadata': {'raw_output_generate_text_classification_data_0': '{\\n \"input_text\": \"The recent decision by the European Central Bank to raise interest rates will likely have a significant impact on the eurozone\\'s economic growth, with some analysts predicting a 0.5% contraction in GDP due to the increased borrowing costs. The move is seen as a measure to combat inflation, which has been rising steadily over the past year.\",\\n \"label\": \"Business\",\\n \"misleading_label\": \"World\"\\n}'},\n 'model_name': 'meta-llama/Meta-Llama-3.1-8B-Instruct'} \n You can push the dataset to the Hub for sharing with the community and embed it to explore the data. distiset.push_to_hub(\"[your-owner-name]/example-texcat-generation-dataset\")\n By examining the distiset distribution, we can confirm that it includes at least the 8 required samples for each label to train our classification models with SetFit. all_labels = [\n entry[\"label\"]\n for dataset_name in distiset\n for entry in distiset[dataset_name][\"train\"]\n]\n\nCounter(all_labels)\n \nCounter({'Sci/Tech': 275,\n 'Business': 130,\n 'World': 86,\n 'Fact-based': 86,\n 'Sports': 64,\n 'Opinion-based': 54,\n None: 20,\n 'Opinion Based': 1,\n 'News/Opinion': 1,\n 'Science': 1,\n 'Environment': 1,\n 'Opinion': 1}) \n We will create two datasets with the required labels and data for our use cases. def extract_rows(distiset, labels):\n return [\n {\n \"text\": entry[\"input_text\"],\n \"label\": entry[\"label\"],\n \"id\": i\n }\n for dataset_name in distiset\n for i, entry in enumerate(distiset[dataset_name][\"train\"])\n if entry[\"label\"] in labels\n ]\n\ndata_topic = extract_rows(distiset, labels_topic)\ndata_fact_opinion = extract_rows(distiset, labels_fact_opinion)\n Get started in Argilla If you are not familiar with Argilla, we recommend taking a look at the Argilla quickstart docs. Alternatively, you can use your Hugging Face account to login to the Argilla demo Space. To get the most out of our data, we will use Argilla. First, we need to connect to the Argilla instance. import argilla as rg\n\n# Replace api_url with your url if using Docker\n# Replace api_key with your API key under \"My Settings\" in the UI\n# Uncomment the last line and set your HF_TOKEN if your space is private\nclient = rg.Argilla(\n api_url=\"https://[your-owner-name]-[your_space_name].hf.space\",\n api_key=\"[your-api-key]\",\n # headers={\"Authorization\": f\"Bearer {HF_TOKEN}\"}\n)\n We will create a Dataset for each task, with an input TextField for the text classification text and a LabelQuestion to ensure the generated labels are correct. 
def create_texcat_dataset(dataset_name, labels):\n settings = rg.Settings(\n fields=[rg.TextField(\"text\")],\n questions=[\n rg.LabelQuestion(\n name=\"label\",\n title=\"Classify the texts according to the following labels\",\n labels=labels,\n ),\n ],\n )\n return rg.Dataset(name=dataset_name, settings=settings).create()\n\n\nrg_dataset_topic = create_texcat_dataset(\"topic-classification\", labels_topic)\nrg_dataset_fact_opinion = create_texcat_dataset(\n \"fact-opinion-classification\", labels_fact_opinion\n)\n Now, we can upload the generated data to Argilla and evaluate it. We will use the generated labels as suggestions. rg_dataset_topic.records.log(data_topic)\nrg_dataset_fact_opinion.records.log(data_fact_opinion)\n Now, we can start the annotation process. Just open the dataset in the Argilla UI and start annotating the records. If the suggestions are correct, you can just click on Submit . Otherwise, you can select the correct label. Note Check this how-to guide to know more about annotating in the UI. Once, you get the annotations, let's continue by retrieving the data from Argilla and format it as a dataset with the required data. rg_dataset_topic = client.datasets(\"topic-classification\")\nrg_dataset_fact_opinion = client.datasets(\"fact-opinion-classification\")\n status_filter = rg.Query(filter=rg.Filter((\"response.status\", \"==\", \"submitted\")))\n\nsubmitted_topic = rg_dataset_topic.records(status_filter).to_list(flatten=True)\nsubmitted_fact_opinion = rg_dataset_fact_opinion.records(status_filter).to_list(\n flatten=True\n)\n def format_submitted(submitted):\n return [\n {\n \"text\": r[\"text\"],\n \"label\": r[\"label.responses\"][0],\n \"id\": i,\n }\n for i, r in enumerate(submitted)\n ]\n\ndata_topic = format_submitted(submitted_topic)\ndata_fact_opinion = format_submitted(submitted_fact_opinion)\n In our case, we will fine-tune using SetFit. However, you can select the one that best fits your requirements. The next step will be to format the data to be compatible with SetFit. In the case of the topic classification, we will need to combine the synthetic data with the original data. hf_topic = hf_dataset.to_list()\nnum = len(data_topic)\n\ndata_topic.extend(\n [\n {\n \"text\": r[\"text\"],\n \"label\": id2str[r[\"label\"]],\n \"id\": num + i,\n }\n for i, r in enumerate(hf_topic)\n ]\n)\n If we check the data distribution now, we can see that we have enough samples for each label to train our models. labels = [record[\"label\"] for record in data_topic]\nCounter(labels)\n \nCounter({'Sci/Tech': 275, 'Business': 132, 'World': 98, 'Sports': 70}) \n labels = [record[\"label\"] for record in data_fact_opinion]\nCounter(labels)\n \nCounter({'Fact-based': 86, 'Opinion-based': 54}) \n Now, let's create our training and validation datasets. The training dataset will gather 8 samples by label. In this case, the validation datasets will contain the remaining samples not included in the training datasets. 
def sample_and_split(dataset, label_column, num_samples):\n train_dataset = sample_dataset(\n dataset, label_column=label_column, num_samples=num_samples\n )\n eval_dataset = dataset.filter(lambda x: x[\"id\"] not in set(train_dataset[\"id\"]))\n return train_dataset, eval_dataset\n\n\ndataset_topic_full = Dataset.from_list(data_topic)\ndataset_fact_opinion_full = Dataset.from_list(data_fact_opinion)\n\ntrain_dataset_topic, eval_dataset_topic = sample_and_split(\n dataset_topic_full, \"label\", 8\n)\ntrain_dataset_fact_opinion, eval_dataset_fact_opinion = sample_and_split(\n dataset_fact_opinion_full, \"label\", 8\n)\n Let's train our models for each task! We will use TaylorAI/bge-micro-v2, available in the Hugging Face Hub. You can check the MTEB leaderboard to select the best model for your use case. def train_model(model_name, dataset, eval_dataset):\n model = SetFitModel.from_pretrained(model_name)\n\n trainer = Trainer(\n model=model,\n train_dataset=dataset,\n )\n trainer.train()\n metrics = trainer.evaluate(eval_dataset)\n print(metrics)\n\n return model\n model_topic = train_model(\n model_name=\"TaylorAI/bge-micro-v2\",\n dataset=train_dataset_topic,\n eval_dataset=eval_dataset_topic,\n)\nmodel_topic.save_pretrained(\"topic_classification_model\")\nmodel_topic = SetFitModel.from_pretrained(\"topic_classification_model\")\n \n***** Running training *****\n Num unique pairs = 768\n Batch size = 16\n Num epochs = 1\n Total optimization steps = 48\n \n \n{'embedding_loss': 0.1873, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}\n \n \n***** Running evaluation *****\n \n \n{'train_runtime': 4.9767, 'train_samples_per_second': 154.318, 'train_steps_per_second': 9.645, 'epoch': 1.0}\n{'accuracy': 0.8333333333333334}\n \n model_fact_opinion = train_model(\n model_name=\"TaylorAI/bge-micro-v2\",\n dataset=train_dataset_fact_opinion,\n eval_dataset=eval_dataset_fact_opinion,\n)\nmodel_fact_opinion.save_pretrained(\"fact_opinion_classification_model\")\nmodel_fact_opinion = SetFitModel.from_pretrained(\"fact_opinion_classification_model\")\n \n***** Running training *****\n Num unique pairs = 144\n Batch size = 16\n Num epochs = 1\n Total optimization steps = 9\n \n \n{'embedding_loss': 0.2985, 'learning_rate': 2e-05, 'epoch': 0.11}\n \n \n***** Running evaluation *****\n \n \n{'train_runtime': 0.8327, 'train_samples_per_second': 172.931, 'train_steps_per_second': 10.808, 'epoch': 1.0}\n{'accuracy': 0.9090909090909091}\n \n Voil\u00e0! The models are now trained and ready to be used. You can start making predictions to check the model's performance and add the new label. Optionally, you can continue using distilabel to generate additional data or Argilla to verify the quality of the predictions. def predict(model, input, labels):\n model.labels = labels\n prediction = model.predict([input])\n return prediction[0]\n predict(\n model_topic, \"The new iPhone is expected to be released next month.\", labels_topic\n)\n \n'Sci/Tech' \n predict(\n model_fact_opinion,\n \"The new iPhone is expected to be released next month.\",\n labels_fact_opinion,\n)\n \n'Opinion-based' \n In this tutorial, we showcased the detailed steps to build a pipeline for generating text classification data using distilabel. You can customize this pipeline for your own use cases and share your datasets with the community through the Hugging Face Hub. 
We defined two text classification tasks\u2014a topic classification task and a fact versus opinion classification task\u2014and generated new data using various models via the serverless Hugging Face Inference API. Then, we curated the generated data with Argilla. Finally, we trained the models with SetFit using both the original and synthetic data. "},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#generate-synthetic-text-classification-data","title":"Generate synthetic text classification data","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#install-the-dependencies","title":"Install the dependencies","text":"To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command: "},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide. Along with that, you will need to install Argilla as a distilabel extra. "},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#the-dataset","title":"The dataset","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#define-the-text-classification-task","title":"Define the text classification task","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#optional-evaluate-with-argilla","title":"(Optional) Evaluate with Argilla","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#train-your-models","title":"Train your models","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#formatting-the-data","title":"Formatting the data","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#the-actual-training","title":"The actual training","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#conclusions","title":"Conclusions","text":""},{"location":"components-gallery/","title":"Components Gallery","text":" -
Steps Explore all the available Step s that can be used for data manipulation. Steps -
Tasks Explore all the available Task s that can be used with an LLM to perform data generation, annotation, and more. Tasks -
LLMs Explore all the available LLM s integrated with distilabel . LLMs -
Embeddings Explore all the available Embeddings models integrated with distilabel . Embeddings "},{"location":"components-gallery/steps/","title":"Steps Gallery","text":"Category Overview The gallery page showcases the different types of components within distilabel . Icon Category Description text-generation Text generation steps are used to generate text based on a given prompt. chat-generation Chat generation steps are used to generate text based on a conversation. text-classification Text classification steps are used to classify text into a category. text-manipulation Text manipulation steps are used to manipulate or rewrite an input text. evol Evol steps are used to rewrite input text and evolve it to a higher quality. critique Critique steps are used to provide feedback on the quality of the data with a written explanation. scorer Scorer steps are used to evaluate and score the data with a numerical value. preference Preference steps are used to collect preferences on the data with numerical values or ranks. embedding Embedding steps are used to generate embeddings for the data. clustering Clustering steps are used to group similar data points together. columns Columns steps are used to manipulate columns in the data. filtering Filtering steps are used to filter the data based on some criteria. format Format steps are used to format the data. load Load steps are used to load the data. execution Executes python functions. save Save steps are used to save the data. labelling Labelling steps are used to label the data. -
PreferenceToArgilla Creates a preference dataset in Argilla. PreferenceToArgilla -
TextGenerationToArgilla Creates a text generation dataset in Argilla. TextGenerationToArgilla -
CombineColumns CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead. CombineColumns -
PushToHub Push data to a Hugging Face Hub dataset. PushToHub -
LoadDataFromDicts Loads a dataset from a list of dictionaries. LoadDataFromDicts -
DataSampler Step to sample from a dataset. DataSampler -
LoadDataFromHub Loads a dataset from the Hugging Face Hub. LoadDataFromHub -
LoadDataFromFileSystem Loads a dataset from a file in your filesystem. LoadDataFromFileSystem -
LoadDataFromDisk Load a dataset that was previously saved to disk. LoadDataFromDisk -
PrepareExamples Helper step to create examples from query and answers pairs used as Few Shots in APIGen. PrepareExamples -
ConversationTemplate Generate a conversation template from an instruction and a response. ConversationTemplate -
FormatTextGenerationDPO Format the output of your LLMs for Direct Preference Optimization (DPO). FormatTextGenerationDPO -
FormatChatGenerationDPO Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO). FormatChatGenerationDPO -
FormatTextGenerationSFT Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT). FormatTextGenerationSFT -
FormatChatGenerationSFT Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT). FormatChatGenerationSFT -
DeitaFiltering Filter dataset rows using DEITA filtering strategy. DeitaFiltering -
EmbeddingDedup Deduplicates text using embeddings. EmbeddingDedup -
APIGenExecutionChecker Executes the generated function calls. APIGenExecutionChecker -
MinHashDedup Deduplicates text using MinHash and MinHashLSH . MinHashDedup -
CombineOutputs Combine the outputs of several upstream steps. CombineOutputs -
ExpandColumns Expand columns that contain lists into multiple rows. ExpandColumns -
GroupColumns Combines columns from a list of StepInput . GroupColumns -
KeepColumns Keeps selected columns in the dataset. KeepColumns -
MergeColumns Merge columns from a row. MergeColumns -
DBSCAN DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples of high density and expands clusters from them. DBSCAN -
UMAP UMAP is a general purpose manifold learning and dimension reduction algorithm. UMAP -
FaissNearestNeighbour Create a faiss index to get the nearest neighbours. FaissNearestNeighbour -
EmbeddingGeneration Generate embeddings using an Embeddings model. EmbeddingGeneration -
RewardModelScore Assign a score to a response using a Reward Model. RewardModelScore -
FormatPRM Helper step to transform the data into the format expected by the PRM model. FormatPRM -
TruncateTextColumn Truncate a row using a tokenizer or the number of characters. TruncateTextColumn "},{"location":"components-gallery/steps/preferencetoargilla/","title":"PreferenceToArgilla","text":"Creates a preference dataset in Argilla. Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a preference dataset, where there's one field for the instruction and one extra field per each generation within the same record, and then a rating question per each of the generation fields. The rating question asks the annotator to set a rating from 1 to 5 for each of the provided generations. "},{"location":"components-gallery/steps/preferencetoargilla/#note","title":"Note","text":"This step is meant to be used in conjunction with the UltraFeedback step, or any other step generating both ratings and responses for a given set of instruction and generations for the given instruction. But alternatively, it can also be used with any other task or step generating only the instruction and generations , as the ratings and rationales are optional. "},{"location":"components-gallery/steps/preferencetoargilla/#attributes","title":"Attributes","text":" -
num_generations: The number of generations to include in the dataset. -
dataset_name: The name of the dataset in Argilla. -
dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. -
api_url: The URL of the Argilla API. Defaults to None , which means it will be read from the ARGILLA_API_URL environment variable. -
api_key: The API key to authenticate with Argilla. Defaults to None , which means it will be read from the ARGILLA_API_KEY environment variable. "},{"location":"components-gallery/steps/preferencetoargilla/#runtime-parameters","title":"Runtime Parameters","text":""},{"location":"components-gallery/steps/preferencetoargilla/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[generations]\n ICOL2[ratings]\n ICOL3[rationales]\n end\n end\n\n subgraph PreferenceToArgilla\n StepInput[Input Columns: instruction, generations, ratings, rationales]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n "},{"location":"components-gallery/steps/preferencetoargilla/#inputs","title":"Inputs","text":" -
instruction (str ): The instruction that was used to generate the completion. -
generations (List[str] ): The completions that were generated based on the input instruction. -
ratings (List[str] , optional): The ratings for the generations. If not provided, the generated ratings won't be pushed to Argilla. -
rationales (List[str] , optional): The rationales for the ratings. If not provided, the generated rationales won't be pushed to Argilla. "},{"location":"components-gallery/steps/preferencetoargilla/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/preferencetoargilla/#push-a-preference-dataset-to-an-argilla-instance","title":"Push a preference dataset to an Argilla instance","text":"from distilabel.steps import PreferenceToArgilla\n\nto_argilla = PreferenceToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n "},{"location":"components-gallery/steps/preferencetoargilla/#it-can-also-include-ratings-and-rationales","title":"It can also include ratings and rationales","text":"result = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n \"ratings\": [\"4\", \"5\"],\n \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n }\n ],\n )\n)\n# >>> result\n# [\n# {\n# 'instruction': 'instruction',\n# 'generations': ['first_generation', 'second_generation'],\n# 'ratings': ['4', '5'],\n# 'rationales': ['rationale for 4', 'rationale for 5']\n# }\n# ]\n "},{"location":"components-gallery/steps/textgenerationtoargilla/","title":"TextGenerationToArgilla","text":"Creates a text generation dataset in Argilla. Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a text-generation dataset, where there's one field per each input, and then a label question to rate the quality of the completion in either bad (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d). "},{"location":"components-gallery/steps/textgenerationtoargilla/#note","title":"Note","text":"This step is meant to be used in conjunction with a TextGeneration step and no column mapping is needed, as it will use the default values for the instruction and generation columns. "},{"location":"components-gallery/steps/textgenerationtoargilla/#attributes","title":"Attributes","text":" -
dataset_name: The name of the dataset in Argilla. -
dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. -
api_url: The URL of the Argilla API. Defaults to None , which means it will be read from the ARGILLA_API_URL environment variable. -
api_key: The API key to authenticate with Argilla. Defaults to None , which means it will be read from the ARGILLA_API_KEY environment variable. "},{"location":"components-gallery/steps/textgenerationtoargilla/#runtime-parameters","title":"Runtime Parameters","text":""},{"location":"components-gallery/steps/textgenerationtoargilla/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[generation]\n end\n end\n\n subgraph TextGenerationToArgilla\n StepInput[Input Columns: instruction, generation]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n "},{"location":"components-gallery/steps/textgenerationtoargilla/#inputs","title":"Inputs","text":""},{"location":"components-gallery/steps/textgenerationtoargilla/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/textgenerationtoargilla/#push-a-text-generation-dataset-to-an-argilla-instance","title":"Push a text generation dataset to an Argilla instance","text":"from distilabel.steps import TextGenerationToArgilla\n\nto_argilla = TextGenerationToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generation\": \"generation\",\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generation': 'generation'}]\n "},{"location":"components-gallery/steps/combinecolumns/","title":"CombineColumns","text":"CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead. "},{"location":"components-gallery/steps/combinecolumns/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n end\n\n subgraph CombineColumns\n end\n\n "},{"location":"components-gallery/steps/pushtohub/","title":"PushToHub","text":"Push data to a Hugging Face Hub dataset. A GlobalStep which creates a datasets.Dataset with the input data and pushes it to the Hugging Face Hub. "},{"location":"components-gallery/steps/pushtohub/#attributes","title":"Attributes","text":" -
repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded. -
split: The split of the dataset that will be pushed. Defaults to \"train\" . -
private: Whether the dataset to be pushed should be private or not. Defaults to False . -
token: The token that will be used to authenticate in the Hub. If not provided, it will be read from the HF_TOKEN environment variable. If that is not set either, the huggingface_hub library will try to use the token from the local Hugging Face CLI configuration. Defaults to None . "},{"location":"components-gallery/steps/pushtohub/#runtime-parameters","title":"Runtime Parameters","text":" -
repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded. -
split: The split of the dataset that will be pushed. -
private: Whether the dataset to be pushed should be private or not. -
token: The token that will be used to authenticate in the Hub. "},{"location":"components-gallery/steps/pushtohub/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n end\n\n subgraph PushToHub\n StepInput[Input Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n "},{"location":"components-gallery/steps/pushtohub/#inputs","title":"Inputs","text":" - dynamic (
all ): all columns from the input will be used to create the dataset. "},{"location":"components-gallery/steps/pushtohub/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/pushtohub/#push-batches-of-your-dataset-to-the-hugging-face-hub-repository","title":"Push batches of your dataset to the Hugging Face Hub repository","text":"from distilabel.steps import PushToHub\n\npush = PushToHub(repo_id=\"path_to/repo\")\npush.load()\n\nresult = next(\n push.process(\n [\n {\n \"instruction\": \"instruction \",\n \"generation\": \"generation\"\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction ', 'generation': 'generation'}]\n "},{"location":"components-gallery/steps/loaddatafromdicts/","title":"LoadDataFromDicts","text":"Loads a dataset from a list of dictionaries. GeneratorStep that loads a dataset from a list of dictionaries and yields it in batches. "},{"location":"components-gallery/steps/loaddatafromdicts/#attributes","title":"Attributes","text":" - data: The list of dictionaries to load the data from.
"},{"location":"components-gallery/steps/loaddatafromdicts/#runtime-parameters","title":"Runtime Parameters","text":" - batch_size: The batch size to use when processing the data.
"},{"location":"components-gallery/steps/loaddatafromdicts/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph LoadDataFromDicts\n StepOutput[Output Columns: dynamic]\n end\n\n StepOutput --> OCOL0\n "},{"location":"components-gallery/steps/loaddatafromdicts/#outputs","title":"Outputs","text":" - dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
"},{"location":"components-gallery/steps/loaddatafromdicts/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromdicts/#load-data-from-a-list-of-dictionaries","title":"Load data from a list of dictionaries","text":"from distilabel.steps import LoadDataFromDicts\n\nloader = LoadDataFromDicts(\n data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n batch_size=2\n)\nloader.load()\n\nresult = next(loader.process())\n# >>> result\n# ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n "},{"location":"components-gallery/steps/datasampler/","title":"DataSampler","text":"Step to sample from a dataset. GeneratorStep that samples from a dataset and yields it in batches. This step is useful when you have a pipeline that can benefit from using examples in the prompts for example as few-shot learning, that can be changing on each row. For example, you can pass a list of dictionaries with N examples and generate M samples from it (assuming you have another step loading data, this M should have the same size as the data being loaded in that step). The size S argument is the number of samples per row generated, so each example would contain S examples to be used as examples. "},{"location":"components-gallery/steps/datasampler/#attributes","title":"Attributes","text":" -
data: The list of dictionaries to sample from. -
size: Number of samples per example. For example, in a few-shot learning scenario, this is the number of few-shot examples that will be generated per example. Defaults to 2. -
samples: Number of examples that will be generated by the step in total. If used with another loader step, this should be the same as the number of samples in the loader step. Defaults to 100. "},{"location":"components-gallery/steps/datasampler/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph DataSampler\n StepOutput[Output Columns: dynamic]\n end\n\n StepOutput --> OCOL0\n "},{"location":"components-gallery/steps/datasampler/#outputs","title":"Outputs","text":" - dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
"},{"location":"components-gallery/steps/datasampler/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/datasampler/#sample-data-from-a-list-of-dictionaries","title":"Sample data from a list of dictionaries","text":"from distilabel.steps import DataSampler\n\nsampler = DataSampler(\n data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n samples=10,\n size=2,\n batch_size=4\n)\nsampler.load()\n\nresult = next(sampler.process())\n# >>> result\n# ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n "},{"location":"components-gallery/steps/datasampler/#pipeline-with-a-loader-and-a-sampler-combined-in-a-single-stream","title":"Pipeline with a loader and a sampler combined in a single stream","text":"from datasets import load_dataset\n\nfrom distilabel.steps import LoadDataFromDicts, DataSampler\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples\nfrom distilabel.pipeline import Pipeline\n\nds = (\n load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n .shuffle(seed=42)\n .select(range(500))\n .to_list()\n)\ndata = [\n {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n },\n {\n \"func_name\": \"permutation_count\",\n \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n },\n {\n \"func_name\": \"getdivision\",\n \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n },\n]\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n loader_seeds = LoadDataFromDicts(data=data)\n sampler = DataSampler(\n data=ds,\n size=2,\n samples=len(data),\n batch_size=8,\n )\n prep_examples = PrepareExamples()\n\n sampler >> prep_examples\n (\n [loader_seeds, prep_examples]\n >> combine_steps\n )\n# Now we have a single stream of data with the loader and the sampler data\n "},{"location":"components-gallery/steps/loaddatafromhub/","title":"LoadDataFromHub","text":"Loads a dataset from the Hugging Face Hub. GeneratorStep that loads a dataset from the Hugging Face Hub using the datasets library. "},{"location":"components-gallery/steps/loaddatafromhub/#attributes","title":"Attributes","text":" -
repo_id: The Hugging Face Hub repository ID of the dataset to load. -
split: The split of the dataset to load. -
config: The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations. "},{"location":"components-gallery/steps/loaddatafromhub/#runtime-parameters","title":"Runtime Parameters","text":" -
batch_size: The batch size to use when processing the data. -
repo_id: The Hugging Face Hub repository ID of the dataset to load. -
split: The split of the dataset to load. Defaults to 'train'. -
config: The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations. -
revision: The revision of the dataset to load. Defaults to the latest revision. -
streaming: Whether to load the dataset in streaming mode or not. Defaults to False . -
num_examples: The number of examples to load from the dataset. By default, all examples will be loaded. -
storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . "},{"location":"components-gallery/steps/loaddatafromhub/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph LoadDataFromHub\n StepOutput[Output Columns: dynamic]\n end\n\n StepOutput --> OCOL0\n "},{"location":"components-gallery/steps/loaddatafromhub/#outputs","title":"Outputs","text":" - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. "},{"location":"components-gallery/steps/loaddatafromhub/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromhub/#load-data-from-a-dataset-in-hugging-face-hub","title":"Load data from a dataset in Hugging Face Hub","text":"from distilabel.steps import LoadDataFromHub\n\nloader = LoadDataFromHub(\n repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n split=\"test\",\n batch_size=2\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'prompt': 'Arianna has 12...', False)\n "},{"location":"components-gallery/steps/loaddatafromfilesystem/","title":"LoadDataFromFileSystem","text":"Loads a dataset from a file in your filesystem. GeneratorStep that creates a dataset from a file in the filesystem, uses Hugging Face datasets library. Take a look at Hugging Face Datasets for more information of the supported file types. "},{"location":"components-gallery/steps/loaddatafromfilesystem/#attributes","title":"Attributes","text":" -
data_files: The path to the file, or the directory containing the files that make up the dataset. -
split: The split of the dataset to load (typically train , test or validation ). "},{"location":"components-gallery/steps/loaddatafromfilesystem/#runtime-parameters","title":"Runtime Parameters","text":" -
batch_size: The batch size to use when processing the data. -
data_files: The path to the file, or the directory containing the files that make up the dataset. -
split: The split of the dataset to load. Defaults to 'train'. -
streaming: Whether to load the dataset in streaming mode or not. Defaults to False . -
num_examples: The number of examples to load from the dataset. By default, all examples will be loaded. -
storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . -
filetype: The expected filetype. If not provided, it will be inferred from the file extension. For more than one file, it will be inferred from the first file. "},{"location":"components-gallery/steps/loaddatafromfilesystem/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph LoadDataFromFileSystem\n StepOutput[Output Columns: dynamic]\n end\n\n StepOutput --> OCOL0\n "},{"location":"components-gallery/steps/loaddatafromfilesystem/#outputs","title":"Outputs","text":" - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. "},{"location":"components-gallery/steps/loaddatafromfilesystem/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-from-a-hugging-face-dataset-in-your-file-system","title":"Load data from a Hugging Face dataset in your file system","text":"from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/loaddatafromfilesystem/#specify-a-filetype-if-the-file-extension-is-not-expected","title":"Specify a filetype if the file extension is not expected","text":"from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-from-a-file-in-your-cloud-provider","title":"Load data from a file in your cloud provider","text":"from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n data_files=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-passing-a-glob-pattern","title":"Load data passing a glob pattern","text":"from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n data_files=\"path/to/dataset/*.jsonl\",\n streaming=True\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/loaddatafromdisk/","title":"LoadDataFromDisk","text":"Load a dataset that was previously saved to disk. If you previously saved your dataset using the save_to_disk method, or Distiset.save_to_disk you can load it again to build a new pipeline using this class. "},{"location":"components-gallery/steps/loaddatafromdisk/#attributes","title":"Attributes","text":" -
dataset_path: The path to the dataset or distiset. -
split: The split of the dataset to load (typically train , test or validation ). -
config: The configuration of the dataset to load. Defaults to default . If there are multiple configurations in the dataset, this must be supplied or an error is raised. "},{"location":"components-gallery/steps/loaddatafromdisk/#runtime-parameters","title":"Runtime Parameters","text":" -
batch_size: The batch size to use when processing the data. -
dataset_path: The path to the dataset or distiset. -
is_distiset: Whether the dataset to load is a Distiset or not. Defaults to False. -
split: The split of the dataset to load. Defaults to 'train'. -
config: The configuration of the dataset to load. Defaults to default . If there are multiple configurations in the dataset, this must be supplied or an error is raised. -
num_examples: The number of examples to load from the dataset. By default, all examples will be loaded. -
storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . "},{"location":"components-gallery/steps/loaddatafromdisk/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph LoadDataFromDisk\n StepOutput[Output Columns: dynamic]\n end\n\n StepOutput --> OCOL0\n "},{"location":"components-gallery/steps/loaddatafromdisk/#outputs","title":"Outputs","text":" - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. "},{"location":"components-gallery/steps/loaddatafromdisk/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-hugging-face-dataset","title":"Load data from a Hugging Face Dataset","text":"from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-distilabel-distiset","title":"Load data from a distilabel Distiset","text":"from distilabel.steps import LoadDataFromDisk\n\n# Specify the configuration to load.\nloader = LoadDataFromDisk(\n dataset_path=\"path/to/dataset\",\n is_distiset=True,\n config=\"leaf_step_1\"\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n "},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-hugging-face-dataset-or-distiset-in-your-cloud-provider","title":"Load data from a Hugging Face Dataset or Distiset in your cloud provider","text":"from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(\n dataset_path=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/prepareexamples/","title":"PrepareExamples","text":"Helper step to create examples from query and answers pairs used as Few Shots in APIGen. "},{"location":"components-gallery/steps/prepareexamples/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[query]\n ICOL1[answers]\n end\n subgraph New columns\n OCOL0[examples]\n end\n end\n\n subgraph PrepareExamples\n StepInput[Input Columns: query, answers]\n StepOutput[Output Columns: examples]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/prepareexamples/#inputs","title":"Inputs","text":""},{"location":"components-gallery/steps/prepareexamples/#outputs","title":"Outputs","text":" - examples (
str ): The formatted examples. "},{"location":"components-gallery/steps/prepareexamples/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/prepareexamples/#generate-examples-for-apigen","title":"Generate examples for APIGen","text":"from distilabel.steps.tasks.apigen.utils import PrepareExamples\n\nprepare_examples = PrepareExamples()\nresult = next(prepare_examples.process(\n [\n {\n \"query\": ['I need the area of circles with radius 2.5, 5, and 7.5 inches, please.', 'Can you provide the current locations of buses and trolleys on route 12?'],\n \"answers\": ['[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]', '[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]']\n }\n ]\n)\n# result\n# [{'examples': '## Query:\\nI need the area of circles with radius 2.5, 5, and 7.5 inches, please.\\n## Answers:\\n[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]\\n\\n## Query:\\nCan you provide the current locations of buses and trolleys on route 12?\\n## Answers:\\n[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]'}, {'examples': '## Query:\\nI need the area of circles with radius 2.5, 5, and 7.5 inches, please.\\n## Answers:\\n[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]\\n\\n## Query:\\nCan you provide the current locations of buses and trolleys on route 12?\\n## Answers:\\n[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]'}]\n "},{"location":"components-gallery/steps/conversationtemplate/","title":"ConversationTemplate","text":"Generate a conversation template from an instruction and a response. "},{"location":"components-gallery/steps/conversationtemplate/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[response]\n end\n subgraph New columns\n OCOL0[conversation]\n end\n end\n\n subgraph ConversationTemplate\n StepInput[Input Columns: instruction, response]\n StepOutput[Output Columns: conversation]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/conversationtemplate/#inputs","title":"Inputs","text":""},{"location":"components-gallery/steps/conversationtemplate/#outputs","title":"Outputs","text":" - conversation (
ChatType ): The conversation template. "},{"location":"components-gallery/steps/conversationtemplate/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/conversationtemplate/#create-a-conversation-from-an-instruction-and-a-response","title":"Create a conversation from an instruction and a response","text":"from distilabel.steps import ConversationTemplate\n\nconv_template = ConversationTemplate()\nconv_template.load()\n\nresult = next(\n conv_template.process(\n [\n {\n \"instruction\": \"Hello\",\n \"response\": \"Hi\",\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n "},{"location":"components-gallery/steps/formattextgenerationdpo/","title":"FormatTextGenerationDPO","text":"Format the output of your LLMs for Direct Preference Optimization (DPO). FormatTextGenerationDPO is a Step that formats the output of the combination of a TextGeneration task with a preference Task i.e. a task generating ratings , so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings . Use this step to transform the output of a combination of a TextGeneration + a preference task such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook . "},{"location":"components-gallery/steps/formattextgenerationdpo/#note","title":"Note","text":"The generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations. "},{"location":"components-gallery/steps/formattextgenerationdpo/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[system_prompt]\n ICOL1[instruction]\n ICOL2[generations]\n ICOL3[generation_models]\n ICOL4[ratings]\n end\n subgraph New columns\n OCOL0[prompt]\n OCOL1[prompt_id]\n OCOL2[chosen]\n OCOL3[chosen_model]\n OCOL4[chosen_rating]\n OCOL5[rejected]\n OCOL6[rejected_model]\n OCOL7[rejected_rating]\n end\n end\n\n subgraph FormatTextGenerationDPO\n StepInput[Input Columns: system_prompt, instruction, generations, generation_models, ratings]\n StepOutput[Output Columns: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n ICOL4 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n StepOutput --> OCOL5\n StepOutput --> OCOL6\n StepOutput --> OCOL7\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/formattextgenerationdpo/#inputs","title":"Inputs","text":" -
system_prompt (str , optional): The system prompt used within the LLM to generate the generations , if available. -
instruction (str ): The instruction used to generate the generations with the LLM . -
generations (List[str] ): The generations produced by the LLM . -
generation_models (List[str] , optional): The model names used to generate the generations . Only available if the model_name from the TextGeneration task/s is combined into a single column with this name; otherwise, it will be ignored. -
ratings (List[float] ): The ratings for each of the generations , produced by a preference task such as UltraFeedback . "},{"location":"components-gallery/steps/formattextgenerationdpo/#outputs","title":"Outputs","text":" -
prompt (str ): The instruction used to generate the generations with the LLM . -
prompt_id (str ): The SHA256 hash of the prompt . -
chosen (List[Dict[str, str]] ): The chosen generation based on the ratings . -
chosen_model (str , optional): The model name used to generate the chosen generation, if the generation_models are available. -
chosen_rating (float ): The rating of the chosen generation. -
rejected (List[Dict[str, str]] ): The rejected generation based on the ratings . -
rejected_model (str , optional): The model name used to generate the rejected generation, if the generation_models are available. -
rejected_rating (float ): The rating of the rejected generation. "},{"location":"components-gallery/steps/formattextgenerationdpo/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formattextgenerationdpo/#format-your-dataset-for-dpo-fine-tuning","title":"Format your dataset for DPO fine tuning","text":"from distilabel.steps import FormatTextGenerationDPO\n\nformat_dpo = FormatTextGenerationDPO()\nformat_dpo.load()\n\n# NOTE: Both \"system_prompt\" and \"generation_models\" can be added optionally.\nresult = next(\n format_dpo.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n)\n# >>> result\n# [\n# { 'instruction': \"What's 2+2?\",\n# 'generations': ['4', '5', '6'],\n# 'ratings': [1, 0, -1],\n# 'prompt': \"What's 2+2?\",\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'chosen_rating': 1,\n# 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n# 'rejected_rating': -1\n# }\n# ]\n "},{"location":"components-gallery/steps/formatchatgenerationdpo/","title":"FormatChatGenerationDPO","text":"Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO). FormatChatGenerationDPO is a Step that formats the output of the combination of a ChatGeneration task with a preference Task i.e. a task generating ratings such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook ., so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings . "},{"location":"components-gallery/steps/formatchatgenerationdpo/#note","title":"Note","text":"The messages column should contain at least one message from the user, the generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations. "},{"location":"components-gallery/steps/formatchatgenerationdpo/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[messages]\n ICOL1[generations]\n ICOL2[generation_models]\n ICOL3[ratings]\n end\n subgraph New columns\n OCOL0[prompt]\n OCOL1[prompt_id]\n OCOL2[chosen]\n OCOL3[chosen_model]\n OCOL4[chosen_rating]\n OCOL5[rejected]\n OCOL6[rejected_model]\n OCOL7[rejected_rating]\n end\n end\n\n subgraph FormatChatGenerationDPO\n StepInput[Input Columns: messages, generations, generation_models, ratings]\n StepOutput[Output Columns: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n StepOutput --> OCOL5\n StepOutput --> OCOL6\n StepOutput --> OCOL7\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/formatchatgenerationdpo/#inputs","title":"Inputs","text":" -
messages (List[Dict[str, str]] ): The conversation messages. -
generations (List[str] ): The generations produced by the LLM . -
generation_models (List[str] , optional): The model names used to generate the generations . Only available if the model_name from the ChatGeneration task/s is combined into a single column with this name; otherwise, it will be ignored. -
ratings (List[float] ): The ratings for each of the generations , produced by a preference task such as UltraFeedback . "},{"location":"components-gallery/steps/formatchatgenerationdpo/#outputs","title":"Outputs","text":" -
prompt (str ): The user message used to generate the generations with the LLM . -
prompt_id (str ): The SHA256 hash of the prompt . -
chosen (List[Dict[str, str]] ): The chosen generation based on the ratings . -
chosen_model (str , optional): The model name used to generate the chosen generation, if the generation_models are available. -
chosen_rating (float ): The rating of the chosen generation. -
rejected (List[Dict[str, str]] ): The rejected generation based on the ratings . -
rejected_model (str , optional): The model name used to generate the rejected generation, if the generation_models are available. -
rejected_rating (float ): The rating of the rejected generation. "},{"location":"components-gallery/steps/formatchatgenerationdpo/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formatchatgenerationdpo/#format-your-dataset-for-dpo-fine-tuning","title":"Format your dataset for DPO fine tuning","text":"from distilabel.steps import FormatChatGenerationDPO\n\nformat_dpo = FormatChatGenerationDPO()\nformat_dpo.load()\n\n# NOTE: \"generation_models\" can be added optionally.\nresult = next(\n format_dpo.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n# 'generations': ['4', '5', '6'],\n# 'ratings': [1, 0, -1],\n# 'prompt': \"What's 2+2?\",\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'chosen_rating': 1,\n# 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n# 'rejected_rating': -1\n# }\n# ]\n "},{"location":"components-gallery/steps/formattextgenerationsft/","title":"FormatTextGenerationSFT","text":"Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT). FormatTextGenerationSFT is a Step that formats the output of a TextGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook . The output of the TextGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation. "},{"location":"components-gallery/steps/formattextgenerationsft/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[system_prompt]\n ICOL1[instruction]\n ICOL2[generation]\n end\n subgraph New columns\n OCOL0[prompt]\n OCOL1[prompt_id]\n OCOL2[messages]\n end\n end\n\n subgraph FormatTextGenerationSFT\n StepInput[Input Columns: system_prompt, instruction, generation]\n StepOutput[Output Columns: prompt, prompt_id, messages]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/formattextgenerationsft/#inputs","title":"Inputs","text":" -
system_prompt (str , optional): The system prompt used within the LLM to generate the generation , if available. -
instruction (str ): The instruction used to generate the generation with the LLM . -
generation (str ): The generation produced by the LLM . "},{"location":"components-gallery/steps/formattextgenerationsft/#outputs","title":"Outputs","text":" -
prompt (str ): The instruction used to generate the generation with the LLM . -
prompt_id (str ): The SHA256 hash of the prompt . -
messages (List[Dict[str, str]] ): The chat-like conversation with the instruction as the user message and the generation as the assistant message. "},{"location":"components-gallery/steps/formattextgenerationsft/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formattextgenerationsft/#format-your-dataset-for-sft-fine-tuning","title":"Format your dataset for SFT fine tuning","text":"from distilabel.steps import FormatTextGenerationSFT\n\nformat_sft = FormatTextGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n format_sft.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generation\": \"4\"\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'instruction': 'What's 2+2?',\n# 'generation': '4',\n# 'prompt': 'What's 2+2?',\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n# }\n# ]\n "},{"location":"components-gallery/steps/formatchatgenerationsft/","title":"FormatChatGenerationSFT","text":"Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT). FormatChatGenerationSFT is a Step that formats the output of a ChatGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook . The output of the ChatGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation. "},{"location":"components-gallery/steps/formatchatgenerationsft/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[system_prompt]\n ICOL1[instruction]\n ICOL2[generation]\n end\n subgraph New columns\n OCOL0[prompt]\n OCOL1[prompt_id]\n OCOL2[messages]\n end\n end\n\n subgraph FormatChatGenerationSFT\n StepInput[Input Columns: system_prompt, instruction, generation]\n StepOutput[Output Columns: prompt, prompt_id, messages]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/formatchatgenerationsft/#inputs","title":"Inputs","text":" -
system_prompt (str , optional): The system prompt used within the LLM to generate the generation , if available. -
instruction (str ): The instruction used to generate the generation with the LLM . -
generation (str ): The generation produced by the LLM . "},{"location":"components-gallery/steps/formatchatgenerationsft/#outputs","title":"Outputs","text":" -
prompt (str ): The instruction used to generate the generation with the LLM . -
prompt_id (str ): The SHA256 hash of the prompt . -
messages (List[Dict[str, str]] ): The chat-like conversation with the instruction as the user message and the generation as the assistant message. "},{"location":"components-gallery/steps/formatchatgenerationsft/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formatchatgenerationsft/#format-your-dataset-for-sft","title":"Format your dataset for SFT","text":"from distilabel.steps import FormatChatGenerationSFT\n\nformat_sft = FormatChatGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n format_sft.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n \"generation\": \"4\"\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'generation': '4',\n# 'prompt': 'What's 2+2?',\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# }\n# ]\n "},{"location":"components-gallery/steps/deitafiltering/","title":"DeitaFiltering","text":"Filter dataset rows using DEITA filtering strategy. Filter the dataset based on the DEITA score and the cosine distance between the embeddings. It's an implementation of the filtering step from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. "},{"location":"components-gallery/steps/deitafiltering/#attributes","title":"Attributes","text":" -
data_budget: The desired size of the dataset after filtering. -
diversity_threshold: If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset. Defaults to 0.9 . -
normalize_embeddings: Whether to normalize the embeddings before computing the cosine distance. Defaults to True . "},{"location":"components-gallery/steps/deitafiltering/#runtime-parameters","title":"Runtime Parameters","text":" -
data_budget: The desired size of the dataset after filtering. -
diversity_threshold: If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset. "},{"location":"components-gallery/steps/deitafiltering/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[evol_instruction_score]\n ICOL1[evol_response_score]\n ICOL2[embedding]\n end\n subgraph New columns\n OCOL0[deita_score]\n OCOL1[deita_score_computed_with]\n OCOL2[nearest_neighbor_distance]\n end\n end\n\n subgraph DeitaFiltering\n StepInput[Input Columns: evol_instruction_score, evol_response_score, embedding]\n StepOutput[Output Columns: deita_score, deita_score_computed_with, nearest_neighbor_distance]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/deitafiltering/#inputs","title":"Inputs","text":" -
evol_instruction_score (float ): The score of the instruction generated by ComplexityScorer step. -
evol_response_score (float ): The score of the response generated by QualityScorer step. -
embedding (List[float] ): The embedding generated for the conversation of the instruction-response pair using GenerateEmbeddings step. "},{"location":"components-gallery/steps/deitafiltering/#outputs","title":"Outputs","text":" -
deita_score (float ): The DEITA score for the instruction-response pair. -
deita_score_computed_with (List[str] ): The scores used to compute the DEITA score. -
nearest_neighbor_distance (float ): The cosine distance between the embeddings of the instruction-response pair. "},{"location":"components-gallery/steps/deitafiltering/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/deitafiltering/#filter-the-dataset-based-on-the-deita-score-and-the-cosine-distance-between-the-embeddings","title":"Filter the dataset based on the DEITA score and the cosine distance between the embeddings","text":"from distilabel.steps import DeitaFiltering\n\ndeita_filtering = DeitaFiltering(data_budget=1)\n\ndeita_filtering.load()\n\nresult = next(\n deita_filtering.process(\n [\n {\n \"evol_instruction_score\": 0.5,\n \"evol_response_score\": 0.5,\n \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n },\n {\n \"evol_instruction_score\": 0.6,\n \"evol_response_score\": 0.6,\n \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n },\n {\n \"evol_instruction_score\": 0.7,\n \"evol_response_score\": 0.7,\n \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n },\n ],\n )\n)\n# >>> result\n# [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n "},{"location":"components-gallery/steps/deitafiltering/#references","title":"References","text":" - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/steps/embeddingdedup/","title":"EmbeddingDedup","text":"Deduplicates text using embeddings. EmbeddingDedup is a Step that detects near-duplicates in datasets, using embeddings to compare the similarity between the texts. The typical workflow with this step would include having a dataset with embeddings precomputed, and then (possibly using the FaissNearestNeighbour ) using the nn_indices and nn_scores , determine the texts that are duplicate. "},{"location":"components-gallery/steps/embeddingdedup/#attributes","title":"Attributes","text":" - threshold: the threshold to consider 2 examples as duplicates. It's dependent on the type of index that was used to generate the embeddings. For example, if the embeddings were generated using cosine similarity, a threshold of
0.9 would make all the texts with a cosine similarity above the value duplicates. Higher values detect less duplicates in such an index, but that should be taken into account when building it. Defaults to 0.9 . Runtime Parameters: - threshold : the threshold to consider 2 examples as duplicates. "},{"location":"components-gallery/steps/embeddingdedup/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[nn_indices]\n ICOL1[nn_scores]\n end\n subgraph New columns\n OCOL0[keep_row_after_embedding_filtering]\n end\n end\n\n subgraph EmbeddingDedup\n StepInput[Input Columns: nn_indices, nn_scores]\n StepOutput[Output Columns: keep_row_after_embedding_filtering]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/embeddingdedup/#inputs","title":"Inputs","text":" -
nn_indices (List[int] ): a list containing the indices of the k nearest neighbours in the inputs for the row. -
nn_scores (List[float] ): a list containing the score or distance to each k nearest neighbour in the inputs. "},{"location":"components-gallery/steps/embeddingdedup/#outputs","title":"Outputs","text":" - keep_row_after_embedding_filtering (
bool ): boolean indicating if the piece text is not a duplicate i.e. this text should be kept. "},{"location":"components-gallery/steps/embeddingdedup/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/embeddingdedup/#deduplicate-a-list-of-texts-using-embedding-information","title":"Deduplicate a list of texts using embedding information","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n data = LoadDataFromDicts(\n data=[\n {\n \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n \"embedding\": [\n 0.018477669046149742,\n -0.03748236608841726,\n 0.001919870620352492,\n 0.024918478063770535,\n 0.02348063521315178,\n 0.0038251285566308375,\n -0.01723884983037716,\n 0.02881971942372201,\n ],\n \"nn_indices\": [0, 1],\n \"nn_scores\": [\n 0.9164746999740601,\n 0.782106876373291,\n ],\n },\n {\n \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n \"embedding\": [\n -0.0023464179614082125,\n -0.07325472251663565,\n -0.06058678419516501,\n -0.02100326928586996,\n -0.013462744792362657,\n 0.027368447064244242,\n -0.003916070100455717,\n 0.01243614518480423,\n ],\n \"nn_indices\": [0, 2],\n \"nn_scores\": [\n 0.7552462220191956,\n 0.7261884808540344,\n ],\n },\n {\n \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on breaking down complex music notation into understandable steps for their students.\",\n \"embedding\": [\n -0.01630817942328242,\n -0.023760151552345232,\n -0.014249650090627883,\n -0.005713686451446624,\n -0.016033059279131567,\n 0.0071440908501058786,\n -0.05691099643425161,\n 0.01597412704817784,\n ],\n \"nn_indices\": [1, 2],\n \"nn_scores\": [\n 0.8107735514640808,\n 0.7172299027442932,\n ],\n },\n ],\n batch_size=batch_size,\n )\n # In general you should do something like this before the deduplication step, to obtain the\n # `nn_indices` and `nn_scores`. In this case the embeddings are already normalized, so there's\n # no need for it.\n # nn = FaissNearestNeighbour(\n # k=30,\n # metric_type=faiss.METRIC_INNER_PRODUCT,\n # search_batch_size=50,\n # train_size=len(dataset), # The number of embeddings to use for training\n # string_factory=\"IVF300_HNSW32,Flat\" # To use an index (optional, maybe required for big datasets)\n # )\n # Read more about the `string_factory` here:\n # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n embedding_dedup = EmbeddingDedup(\n threshold=0.8,\n input_batch_size=batch_size,\n )\n\n data >> embedding_dedup\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n "},{"location":"components-gallery/steps/apigenexecutionchecker/","title":"APIGenExecutionChecker","text":"Executes the generated function calls. This step checks if a given answer from a model as generated by APIGenGenerator can be executed against the given library (given by libpath , which is a string pointing to a python .py file with functions). "},{"location":"components-gallery/steps/apigenexecutionchecker/#attributes","title":"Attributes","text":" -
libpath: The path to the library where we will retrieve the functions. It can also point to a folder with the functions. In this case, the folder layout should be a folder with .py files, each containing a single function, the name of the function being the same as the filename. -
check_is_dangerous: Bool to exclude some potentially dangerous functions; it contains some heuristics found while testing. These functions can run subprocesses, interact with the OS, or perform other potentially dangerous operations. Defaults to True. "},{"location":"components-gallery/steps/apigenexecutionchecker/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[answers]\n end\n subgraph New columns\n OCOL0[keep_row_after_execution_check]\n OCOL1[execution_result]\n end\n end\n\n subgraph APIGenExecutionChecker\n StepInput[Input Columns: answers]\n StepOutput[Output Columns: keep_row_after_execution_check, execution_result]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/apigenexecutionchecker/#inputs","title":"Inputs","text":" -
str ): List with arguments to be passed to the function, dumped as a string from a list of dictionaries. Should be loaded using json.loads . "},{"location":"components-gallery/steps/apigenexecutionchecker/#outputs","title":"Outputs","text":""},{"location":"components-gallery/steps/apigenexecutionchecker/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/apigenexecutionchecker/#execute-a-function-from-a-given-library-with-the-answer-from-an-llm","title":"Execute a function from a given library with the answer from an LLM","text":"from distilabel.steps.tasks import APIGenExecutionChecker\n\n# For the libpath you can use as an example the file at the tests folder:\n# ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\ntask = APIGenExecutionChecker(\n libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n)\ntask.load()\n\nres = next(\n task.process(\n [\n {\n \"answers\": [\n {\n \"arguments\": {\n \"initial_velocity\": 0.2,\n \"acceleration\": 0.1,\n \"time\": 0.5,\n },\n \"name\": \"final_velocity\",\n }\n ],\n }\n ]\n )\n)\nres\n#[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n "},{"location":"components-gallery/steps/apigenexecutionchecker/#references","title":"References","text":""},{"location":"components-gallery/steps/minhashdedup/","title":"MinHashDedup","text":"Deduplicates text using MinHash and MinHashLSH . MinHashDedup is a Step that detects near-duplicates in datasets. The idea roughly translates to the following steps: 1. Tokenize the text into words or ngrams. 2. Create a MinHash for each text. 3. Store the MinHashes in a MinHashLSH . 4. Check if the MinHash is already in the LSH , if so, it is a duplicate. "},{"location":"components-gallery/steps/minhashdedup/#attributes","title":"Attributes","text":" -
num_perm: the number of permutations to use. Defaults to 128 . -
seed: the seed to use for the MinHash. Defaults to 1 . -
tokenizer: the tokenizer to use. Available ones are words or ngrams . If words is selected, it tokenizes the text into words using nltk's word tokenizer. ngrams tokenizes the text into ngrams (of size n ). Defaults to words . -
n: the size of the ngrams to use. Only relevant if tokenizer=\"ngrams\" . Defaults to 5 . -
threshold: the threshold to consider two MinHashes as duplicates. Values closer to 0 detect more duplicates. Defaults to 0.9 . -
storage: the storage to use for the LSH. Can be dict to store the index in memory, or disk . Keep in mind that disk is an experimental feature not defined in datasketch ; it is based on DiskCache's Index class. It should work like a dict , but backed by disk, although depending on the system it can be slower. Defaults to dict . "},{"location":"components-gallery/steps/minhashdedup/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[text]\n end\n subgraph New columns\n OCOL0[keep_row_after_minhash_filtering]\n end\n end\n\n subgraph MinHashDedup\n StepInput[Input Columns: text]\n StepOutput[Output Columns: keep_row_after_minhash_filtering]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/minhashdedup/#inputs","title":"Inputs","text":" - text (
str ): the texts to be filtered. "},{"location":"components-gallery/steps/minhashdedup/#outputs","title":"Outputs","text":" - keep_row_after_minhash_filtering (
bool ): boolean indicating if the piece text is not a duplicate i.e. this text should be kept. "},{"location":"components-gallery/steps/minhashdedup/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/minhashdedup/#deduplicate-a-list-of-texts-using-minhash-and-minhashlsh","title":"Deduplicate a list of texts using MinHash and MinHashLSH","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps import MinHashDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n ds_size = 1000\n batch_size = 500 # Bigger batch sizes work better for this step\n data = LoadDataFromDicts(\n data=[\n {\"text\": \"This is a test document.\"},\n {\"text\": \"This document is a test.\"},\n {\"text\": \"Test document for duplication.\"},\n {\"text\": \"Document for duplication test.\"},\n {\"text\": \"This is another unique document.\"},\n ]\n * (ds_size // 5),\n batch_size=batch_size,\n )\n minhash_dedup = MinHashDedup(\n tokenizer=\"words\",\n threshold=0.9, # lower values will increase the number of duplicates\n storage=\"dict\", # or \"disk\" for bigger datasets\n )\n\n data >> minhash_dedup\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n "},{"location":"components-gallery/steps/minhashdedup/#references","title":"References","text":""},{"location":"components-gallery/steps/combineoutputs/","title":"CombineOutputs","text":"Combine the outputs of several upstream steps. CombineOutputs is a Step that takes the outputs of several upstream steps and combines them to generate a new dictionary with all keys/columns of the upstream steps outputs. "},{"location":"components-gallery/steps/combineoutputs/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph CombineOutputs\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/combineoutputs/#inputs","title":"Inputs","text":" - dynamic (based on the upstream
Step s): All the columns of the upstream steps outputs. "},{"location":"components-gallery/steps/combineoutputs/#outputs","title":"Outputs","text":" - dynamic (based on the upstream
Step s): All the columns of the upstream steps outputs. "},{"location":"components-gallery/steps/combineoutputs/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/combineoutputs/#combine-dictionaries-of-a-dataset","title":"Combine dictionaries of a dataset","text":"from distilabel.steps import CombineOutputs\n\ncombine_outputs = CombineOutputs()\ncombine_outputs.load()\n\nresult = next(\n combine_outputs.process(\n [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n )\n)\n# [\n# {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n# {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n# ]\n "},{"location":"components-gallery/steps/combineoutputs/#combine-upstream-steps-outputs-in-a-pipeline","title":"Combine upstream steps outputs in a pipeline","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs\n\nwith Pipeline() as pipeline:\n step_1 = ...\n step_2 = ...\n step_3 = ...\n combine = CombineOutputs()\n\n [step_1, step_2, step_3] >> combine\n "},{"location":"components-gallery/steps/expandcolumns/","title":"ExpandColumns","text":"Expand columns that contain lists into multiple rows. ExpandColumns is a Step that takes a list of columns and expands them into multiple rows. The new rows will have the same data as the original row, except for the expanded column, which will contain a single item from the original list. "},{"location":"components-gallery/steps/expandcolumns/#attributes","title":"Attributes","text":" -
columns: A dictionary that maps the column to be expanded to the new column name or a list of columns to be expanded. If a list is provided, the new column name will be the same as the column name. -
encoded: A bool indicating whether the columns are JSON encoded lists. If this value is set to True, the columns will be decoded before expanding. Alternatively, to specify columns that can be encoded, a list can be provided. In this case, the column names given must be a subset of the columns selected for expansion. -
split_statistics: A bool indicating whether the statistics in the distilabel_metadata column should be split into multiple rows. If we want to expand some columns containing a list of strings that come from having parsed the output of an LLM, the tokens in the statistics_{step_name} of the distilabel_metadata column should be split to avoid multiplying them if we aggregate the data afterwards. For example, with a task that is supposed to generate a list of N instructions, and we want each of those N instructions in different rows, we should split the statistics by N. In such a case, set this value to True. "},{"location":"components-gallery/steps/expandcolumns/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph ExpandColumns\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/expandcolumns/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): The columns to be expanded into multiple rows. "},{"location":"components-gallery/steps/expandcolumns/#outputs","title":"Outputs","text":" - dynamic (determined by
columns attribute): The expanded columns. "},{"location":"components-gallery/steps/expandcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/expandcolumns/#expand-the-selected-columns-into-multiple-rows","title":"Expand the selected columns into multiple rows","text":"from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"]}\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n "},{"location":"components-gallery/steps/expandcolumns/#expand-the-selected-columns-which-are-json-encoded-into-multiple-rows","title":"Expand the selected columns which are JSON encoded into multiple rows","text":"from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n encoded=True, # It can also be a list of columns that are encoded, i.e. [\"generation\"]\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": '[\"generation 1\", \"generation 2\"]'}\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n "},{"location":"components-gallery/steps/expandcolumns/#expand-the-selected-columns-and-split-the-statistics-in-the-distilabel_metadata-column","title":"Expand the selected columns and split the statistics in the distilabel_metadata column","text":"from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n split_statistics=True,\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"],\n \"distilabel_metadata\": {\n \"statistics_generation\": {\n \"input_tokens\": [12],\n \"output_tokens\": [12],\n },\n },\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}, {'instruction': 'instruction 1', 'generation': 'generation 2', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}]\n "},{"location":"components-gallery/steps/groupcolumns/","title":"GroupColumns","text":"Combines columns from a list of StepInput . GroupColumns is a Step that implements the process method that calls the group_dicts function to handle and combine a list of StepInput . Also GroupColumns provides two attributes columns and output_columns to specify the columns to group and the output columns which will override the default value for the properties inputs and outputs , respectively. 
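A minimal pipeline sketch (illustrative only; the two upstream steps are placeholders and are not taken from the official examples) of how GroupColumns is typically connected to several upstream steps so that their outputs end up grouped into list columns: from distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns\n\nwith Pipeline() as pipeline:\n step_1 = ... # a step producing 'generation' and 'model_name'\n step_2 = ... # another step producing the same columns\n group = GroupColumns(\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"generation_models\"],\n )\n\n # both branches feed the grouping step\n [step_1, step_2] >> group\n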
"},{"location":"components-gallery/steps/groupcolumns/#attributes","title":"Attributes","text":""},{"location":"components-gallery/steps/groupcolumns/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph GroupColumns\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/groupcolumns/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): The columns to group. "},{"location":"components-gallery/steps/groupcolumns/#outputs","title":"Outputs","text":" - dynamic (determined by
columns and output_columns attributes): The columns that were grouped. "},{"location":"components-gallery/steps/groupcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/groupcolumns/#group-columns-of-a-dataset","title":"Group columns of a dataset","text":"from distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n)\ngroup_columns.load()\n\nresult = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n)\n# >>> result\n# [{'merged_generation': ['AI generated text', 'Other generated text'], 'merged_model_name': ['my_model']}]\n "},{"location":"components-gallery/steps/groupcolumns/#specify-the-name-of-the-output-columns","title":"Specify the name of the output columns","text":"from distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"generation_models\"]\n)\ngroup_columns.load()\n\nresult = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n)\n# >>> result\n#[{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n "},{"location":"components-gallery/steps/keepcolumns/","title":"KeepColumns","text":"Keeps selected columns in the dataset. KeepColumns is a Step that implements the process method that keeps only the columns specified in the columns attribute. Also KeepColumns provides an attribute columns to specify the columns to keep which will override the default value for the properties inputs and outputs . "},{"location":"components-gallery/steps/keepcolumns/#note","title":"Note","text":"The order in which the columns are provided is important, as the output will be sorted using the provided order, which is useful before pushing either a dataset.Dataset via the PushToHub step or a distilabel.Distiset via the Pipeline.run output variable. "},{"location":"components-gallery/steps/keepcolumns/#attributes","title":"Attributes","text":" - columns: List of strings with the names of the columns to keep.
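As an orientation only (a minimal sketch; the generation step is a placeholder, not taken from the official examples), KeepColumns is usually placed at the end of a pipeline to drop intermediate columns and fix the column order before exporting the dataset: from distilabel.pipeline import Pipeline\nfrom distilabel.steps import KeepColumns\n\nwith Pipeline() as pipeline:\n generation = ... # any step producing 'instruction', 'generation' and extra columns\n keep_columns = KeepColumns(\n columns=[\"instruction\", \"generation\"], # the output columns follow this order\n )\n\n generation >> keep_columns\n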
"},{"location":"components-gallery/steps/keepcolumns/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph KeepColumns\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/keepcolumns/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): The columns to keep. "},{"location":"components-gallery/steps/keepcolumns/#outputs","title":"Outputs","text":" - dynamic (determined by
columns attribute): The columns that were kept. "},{"location":"components-gallery/steps/keepcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/keepcolumns/#select-the-columns-to-keep","title":"Select the columns to keep","text":"from distilabel.steps import KeepColumns\n\nkeep_columns = KeepColumns(\n columns=[\"instruction\", \"generation\"],\n)\nkeep_columns.load()\n\nresult = next(\n keep_columns.process(\n [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n )\n)\n# >>> result\n# [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n "},{"location":"components-gallery/steps/mergecolumns/","title":"MergeColumns","text":"Merge columns from a row. MergeColumns is a Step that implements the process method that calls the merge_columns function to handle and combine columns in a StepInput . MergeColumns provides two attributes columns and output_column to specify the columns to merge and the resulting output column. This step can be useful if you have a `Task` that generates instructions for example, and you\nwant to have more examples of those. In such a case, you could for example use another `Task`\nto multiply your instructions synthetically, which would yield two different, separate columns.\nUsing `MergeColumns` you can merge them and use them as a single column in your dataset for\nfurther processing.\n "},{"location":"components-gallery/steps/mergecolumns/#attributes","title":"Attributes","text":""},{"location":"components-gallery/steps/mergecolumns/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph MergeColumns\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/mergecolumns/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): The columns to merge. "},{"location":"components-gallery/steps/mergecolumns/#outputs","title":"Outputs","text":" - dynamic (determined by
columns and output_column attributes): The columns that were merged. "},{"location":"components-gallery/steps/mergecolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/mergecolumns/#combine-columns-in-rows-of-a-dataset","title":"Combine columns in rows of a dataset","text":"from distilabel.steps import MergeColumns\n\ncombiner = MergeColumns(\n columns=[\"queries\", \"multiple_queries\"],\n output_column=\"queries\",\n)\ncombiner.load()\n\nresult = next(\n combiner.process(\n [\n {\n \"queries\": \"How are you?\",\n \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n }\n ],\n )\n)\n# >>> result\n# [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n "},{"location":"components-gallery/steps/dbscan/","title":"DBSCAN","text":"DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples in regions of high density and expands clusters from them. This algorithm is good for data which contains clusters of similar density. This is a `GlobalStep` that clusters the embeddings using the DBSCAN algorithm\nfrom `sklearn`. Visit `TextClustering` step for an example of use.\nThe trained model is saved as an artifact when creating a distiset\nand pushing it to the Hugging Face Hub.\n "},{"location":"components-gallery/steps/dbscan/#attributes","title":"Attributes","text":" - eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. - min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If
min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse. - metric: The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter. - n_jobs: The number of parallel jobs to run. "},{"location":"components-gallery/steps/dbscan/#runtime-parameters","title":"Runtime Parameters","text":" -
eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. -
min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse. -
metric: The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter. -
n_jobs: The number of parallel jobs to run. "},{"location":"components-gallery/steps/dbscan/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[projection]\n end\n subgraph New columns\n OCOL0[cluster_label]\n end\n end\n\n subgraph DBSCAN\n StepInput[Input Columns: projection]\n StepOutput[Output Columns: cluster_label]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/dbscan/#inputs","title":"Inputs","text":" - projection (
List[float] ): Vector representation of the text to cluster, normally the output from the UMAP step. "},{"location":"components-gallery/steps/dbscan/#outputs","title":"Outputs","text":" - cluster_label (
int ): Integer representing the label of a given cluster. -1 means it wasn't clustered. "},{"location":"components-gallery/steps/dbscan/#references","title":"References","text":" -
DBSCAN demo of sklearn -
sklearn dbscan "},{"location":"components-gallery/steps/umap/","title":"UMAP","text":"UMAP is a general purpose manifold learning and dimension reduction algorithm. This is a GlobalStep that reduces the dimensionality of the embeddings using UMAP. Visit the TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub. "},{"location":"components-gallery/steps/umap/#attributes","title":"Attributes","text":" - n_components: The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100. - metric: The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to
euclidean . - n_jobs: The number of parallel jobs to run. Defaults to 8 . - random_state: The random state to use for the UMAP algorithm. "},{"location":"components-gallery/steps/umap/#runtime-parameters","title":"Runtime Parameters","text":" -
n_components: The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100. -
metric: The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean . -
n_jobs: The number of parallel jobs to run. Defaults to 8 . -
random_state: The random state to use for the UMAP algorithm. "},{"location":"components-gallery/steps/umap/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[embedding]\n end\n subgraph New columns\n OCOL0[projection]\n end\n end\n\n subgraph UMAP\n StepInput[Input Columns: embedding]\n StepOutput[Output Columns: projection]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/umap/#inputs","title":"Inputs","text":" - embedding (
List[float] ): The original embeddings whose dimensionality we want to reduce. "},{"location":"components-gallery/steps/umap/#outputs","title":"Outputs","text":" - projection (
List[float] ): Embedding reduced to the number of components specified; the size of the new embeddings will be determined by n_components . "},{"location":"components-gallery/steps/umap/#references","title":"References","text":" -
UMAP repository -
UMAP documentation "},{"location":"components-gallery/steps/faissnearestneighbour/","title":"FaissNearestNeighbour","text":"Create a faiss index to get the nearest neighbours. FaissNearestNeighbour is a GlobalStep that creates a faiss index using the Hugging Face datasets library integration, and then gets the nearest neighbours and the scores or distance of the nearest neighbours for each input row. "},{"location":"components-gallery/steps/faissnearestneighbour/#attributes","title":"Attributes","text":" -
device: the CUDA device ID or a list of IDs to be used. If a negative integer is given, all the available GPUs will be used. Defaults to None . -
string_factory: the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None . -
metric_type: the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is to import faiss and then pass one of the faiss.METRIC_x variables. Defaults to None . -
k: the number of nearest neighbours to search for each input row. Defaults to 1 . -
search_batch_size: the number of rows to include in a search batch. The value can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults to 50 . -
train_size: If the index needs a training step, specifies how many vectors will be used to train the index. "},{"location":"components-gallery/steps/faissnearestneighbour/#runtime-parameters","title":"Runtime Parameters","text":" -
device: the CUDA device ID or a list of IDs to be used. If a negative integer is given, all the available GPUs will be used. Defaults to None . -
string_factory: the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None . -
metric_type: the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is to import faiss and then pass one of the faiss.METRIC_x variables. Defaults to None . -
k: the number of nearest neighbours to search for each input row. Defaults to 1 . -
search_batch_size: the number of rows to include in a search batch. The value can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults to 50 . -
train_size: If the index needs a training step, specifies how many vectors will be used to train the index. "},{"location":"components-gallery/steps/faissnearestneighbour/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[embedding]\n end\n subgraph New columns\n OCOL0[nn_indices]\n OCOL1[nn_scores]\n end\n end\n\n subgraph FaissNearestNeighbour\n StepInput[Input Columns: embedding]\n StepOutput[Output Columns: nn_indices, nn_scores]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/faissnearestneighbour/#inputs","title":"Inputs","text":" - embedding (
List[Union[float, int]] ): a sentence embedding. "},{"location":"components-gallery/steps/faissnearestneighbour/#outputs","title":"Outputs","text":" -
nn_indices (List[int] ): a list containing the indices of the k nearest neighbours in the inputs for the row. -
nn_scores (List[float] ): a list containing the score or distance to each k nearest neighbour in the inputs. "},{"location":"components-gallery/steps/faissnearestneighbour/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/faissnearestneighbour/#generating-embeddings-and-getting-the-nearest-neighbours","title":"Generating embeddings and getting the nearest neighbours","text":"from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\nwith Pipeline(name=\"hello\") as pipeline:\n load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n embeddings = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\"\n )\n )\n\n nearest_neighbours = FaissNearestNeighbour()\n\n load_data >> embeddings >> nearest_neighbours\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n load_data.name: {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n },\n use_cache=False,\n )\n "},{"location":"components-gallery/steps/faissnearestneighbour/#references","title":"References","text":""},{"location":"components-gallery/steps/embeddinggeneration/","title":"EmbeddingGeneration","text":"Generate embeddings using an Embeddings model. EmbeddingGeneration is a Step that using an Embeddings model generates sentence embeddings for the provided input texts. "},{"location":"components-gallery/steps/embeddinggeneration/#attributes","title":"Attributes","text":" - embeddings: the
Embeddings model used to generate the sentence embeddings. "},{"location":"components-gallery/steps/embeddinggeneration/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[text]\n end\n subgraph New columns\n OCOL0[embedding]\n end\n end\n\n subgraph EmbeddingGeneration\n StepInput[Input Columns: text]\n StepOutput[Output Columns: embedding]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/embeddinggeneration/#inputs","title":"Inputs","text":" - text (
str ): The text for which the sentence embedding has to be generated. "},{"location":"components-gallery/steps/embeddinggeneration/#outputs","title":"Outputs","text":" - embedding (
List[Union[float, int]] ): the generated sentence embedding. "},{"location":"components-gallery/steps/embeddinggeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/embeddinggeneration/#generate-sentence-embeddings-with-sentence-transformers","title":"Generate sentence embeddings with Sentence Transformers","text":"from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.steps import EmbeddingGeneration\n\nembedding_generation = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\",\n )\n)\n\nembedding_generation.load()\n\nresult = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n# [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n "},{"location":"components-gallery/steps/rewardmodelscore/","title":"RewardModelScore","text":"Assign a score to a response using a Reward Model. RewardModelScore is a Step that, using a Reward Model (RM) loaded with transformers , assigns a score to a response generated for an instruction, or a score to a multi-turn conversation. "},{"location":"components-gallery/steps/rewardmodelscore/#attributes","title":"Attributes","text":" -
model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. -
revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . -
torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\" . -
trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . -
device_map: a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\" . Defaults to None . -
token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None . -
truncation: whether to truncate sequences at the maximum length. Defaults to False . -
max_length: maximum length to use for padding or truncation. Defaults to None . "},{"location":"components-gallery/steps/rewardmodelscore/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[response]\n ICOL2[conversation]\n end\n subgraph New columns\n OCOL0[score]\n end\n end\n\n subgraph RewardModelScore\n StepInput[Input Columns: instruction, response, conversation]\n StepOutput[Output Columns: score]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/rewardmodelscore/#inputs","title":"Inputs","text":" -
instruction (str , optional): the instruction used to generate a response . If provided, then response must be provided too. -
response (str , optional): the response generated for instruction . If provided, then instruction must be provided too. -
conversation (ChatType , optional): a multi-turn conversation. If not provided, then instruction and response columns must be provided. "},{"location":"components-gallery/steps/rewardmodelscore/#outputs","title":"Outputs","text":" - score (
float ): the score given by the reward model for the instruction-response pair or the conversation. "},{"location":"components-gallery/steps/rewardmodelscore/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/rewardmodelscore/#response-pair","title":"response pair","text":"from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n step.process(\n inputs=[\n {\n \"instruction\": \"How much is 2+2?\",\n \"response\": \"The output of 2+2 is 4\",\n },\n {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n ]\n )\n)\n# [\n# {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n# {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n# ]\n "},{"location":"components-gallery/steps/rewardmodelscore/#turn-conversation","title":"turn conversation","text":"from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n step.process(\n inputs=[\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n ],\n },\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"4\"},\n ],\n },\n ]\n )\n)\n# [\n# {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n# {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n# ]\n "},{"location":"components-gallery/steps/formatprm/","title":"FormatPRM","text":"Helper step to transform the data into the format expected by the PRM model. This step can be used to format the data in one of 2 formats: Following the format presented in peiyi9979/Math-Shepherd, in which case this step creates the columns input and label, where the input is the instruction with the solution (and the tag replaced by a token), and the label is the instruction with the solution, both separated by a newline. Following TRL's format for training, which generates the columns prompt, completions, and labels. The labels correspond to the original tags replaced by boolean values, where True represents correct steps. "},{"location":"components-gallery/steps/formatprm/#attributes","title":"Attributes","text":" -
format: The format to use for the PRM model. \"math-shepherd\" corresponds to the original paper, while \"trl\" is a format prepared to train the model using TRL. -
step_token: String that serves as a unique token denoting the position for predicting the step score. -
tags: List of tags that represent the correct and incorrect steps. This only needs to be informed if it's different than the default in MathShepherdCompleter . "},{"location":"components-gallery/steps/formatprm/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[solutions]\n end\n subgraph New columns\n OCOL0[input]\n OCOL1[label]\n OCOL2[prompt]\n OCOL3[completions]\n OCOL4[labels]\n end\n end\n\n subgraph FormatPRM\n StepInput[Input Columns: instruction, solutions]\n StepOutput[Output Columns: input, label, prompt, completions, labels]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/formatprm/#inputs","title":"Inputs","text":""},{"location":"components-gallery/steps/formatprm/#outputs","title":"Outputs","text":" -
input (str ): The instruction with the solutions, where the label tags are replaced by a token. -
label (str ): The instruction with the solutions. -
prompt (str ): The instruction with the solutions, where the label tags are replaced by a token. -
completions (List[str] ): The solution represented as a list of steps. -
labels (List[bool] ): The labels, as a list of booleans, where True represents a good response. "},{"location":"components-gallery/steps/formatprm/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formatprm/#shepherd-format","title":"Shepherd format","text":"from distilabel.steps.tasks import FormatPRM\nfrom distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(columns=[\"solutions\"])\nexpand_columns.load()\n\n# Define our PRM formatter\nformatter = FormatPRM()\nformatter.load()\n\n# Expand the solutions column as it comes from the MathShepherdCompleter\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. 
The answer is: 3 +\"]]\n },\n ]\n )\n)\nresult = next(formatter.process(result))\n "},{"location":"components-gallery/steps/formatprm/#prepare-your-data-to-train-a-prm-model-with-the-trl-format","title":"Prepare your data to train a PRM model with the TRL format","text":"from distilabel.steps.tasks import FormatPRM\nfrom distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(columns=[\"solutions\"])\nexpand_columns.load()\n\n# Define our PRM formatter\nformatter = FormatPRM(format=\"trl\")\nformatter.load()\n\n# Expand the solutions column as it comes from the MathShepherdCompleter\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"]]\n },\n ]\n )\n)\n\nresult = next(formatter.process(result))\n# {\n# \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# \"solutions\": [\n# \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. 
+\",\n# \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\",\n# \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"\n# ],\n# \"prompt\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# \"completions\": [\n# \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required.\",\n# \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber.\",\n# \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3\"\n# ],\n# \"labels\": [\n# true,\n# true,\n# true\n# ]\n# }\n "},{"location":"components-gallery/steps/formatprm/#references","title":"References","text":""},{"location":"components-gallery/steps/truncatetextcolumn/","title":"TruncateTextColumn","text":"Truncate a row using a tokenizer or the number of characters. TruncateTextColumn is a Step that truncates a row according to the max length. If the tokenizer is provided, then the row will be truncated using the tokenizer, and the max_length will be used as the maximum number of tokens, otherwise it will be used as the maximum number of characters. The TruncateTextColumn step is useful when one wants to truncate a row to a certain length, to avoid posterior errors in the model due to the length. "},{"location":"components-gallery/steps/truncatetextcolumn/#attributes","title":"Attributes","text":" -
column: the column to truncate. Defaults to \"text\" . -
max_length: the maximum length to use for truncation. If a tokenizer is given, it corresponds to the number of tokens; otherwise, it corresponds to the number of characters. Defaults to 8192 . -
tokenizer: the name of the tokenizer to use. If provided, the row will be truncated using the tokenizer. Defaults to None . "},{"location":"components-gallery/steps/truncatetextcolumn/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph TruncateTextColumn\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/truncatetextcolumn/#inputs","title":"Inputs","text":" - dynamic (determined by
column attribute): The columns to be truncated, defaults to \"text\". "},{"location":"components-gallery/steps/truncatetextcolumn/#outputs","title":"Outputs","text":" - dynamic (determined by
column attribute): The truncated column. "},{"location":"components-gallery/steps/truncatetextcolumn/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/truncatetextcolumn/#truncating-a-row-to-a-given-number-of-tokens","title":"Truncating a row to a given number of tokens","text":"from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n max_length=4,\n column=\"text\"\n)\n\ntrunc.load()\n\nresult = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n)\n# result\n# [{'text': 'This is a sample'}]\n "},{"location":"components-gallery/steps/truncatetextcolumn/#truncating-a-row-to-a-given-number-of-characters","title":"Truncating a row to a given number of characters","text":"from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(max_length=10)\n\ntrunc.load()\n\nresult = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n)\n# result\n# [{'text': 'This is a '}]\n "},{"location":"components-gallery/tasks/","title":"Tasks Gallery","text":"Category Overview The gallery page showcases the different types of components within distilabel . Icon Category Description text-generation Text generation steps are used to generate text based on a given prompt. chat-generation Chat generation steps are used to generate text based on a conversation. text-classification Text classification steps are used to classify text into a category. text-manipulation Text manipulation steps are used to manipulate or rewrite an input text. evol Evol steps are used to rewrite input text and evolve it to a higher quality. critique Critique steps are used to provide feedback on the quality of the data with a written explanation. scorer Scorer steps are used to evaluate and score the data with a numerical value. preference Preference steps are used to collect preferences on the data with numerical values or ranks. embedding Embedding steps are used to generate embeddings for the data. clustering Clustering steps are used to group similar data points together. columns Columns steps are used to manipulate columns in the data. filtering Filtering steps are used to filter the data based on some criteria. format Format steps are used to format the data. load Load steps are used to load the data. execution Executes python functions. save Save steps are used to save the data. labelling Labelling steps are used to label the data. -
APIGenGenerator Generate queries and answers for the given functions in JSON format. APIGenGenerator -
Genstruct Generate a pair of instruction-response from a document using an LLM . Genstruct -
Magpie Generates conversations using an instruct fine-tuned LLM. Magpie -
MathShepherdCompleter Math Shepherd Completer and auto-labeller task. MathShepherdCompleter -
MathShepherdGenerator Math Shepherd solution generator. MathShepherdGenerator -
SelfInstruct Generate instructions based on a given input using an LLM . SelfInstruct -
TextGeneration Text generation with an LLM given a prompt. TextGeneration -
TextGenerationWithImage Text generation with images with an LLM given a prompt. TextGenerationWithImage -
URIAL Generates a response using a non-instruct fine-tuned model. URIAL -
MagpieGenerator Generator task that generates instructions or conversations using Magpie. MagpieGenerator -
ChatGeneration Generates text based on a conversation. ChatGeneration -
ArgillaLabeller Annotate Argilla records based on input fields, example records and question settings. ArgillaLabeller -
TextClassification Classifies text into one or more categories or labels. TextClassification -
EvolInstruct Evolve instructions using an LLM . EvolInstruct -
EvolComplexity Evolve instructions to make them more complex using an LLM . EvolComplexity -
EvolQuality Evolve the quality of the responses using an LLM . EvolQuality -
EvolInstructGenerator Generate evolved instructions using an LLM . EvolInstructGenerator -
EvolComplexityGenerator Generate evolved instructions with increased complexity using an LLM . EvolComplexityGenerator -
InstructionBacktranslation Self-Alignment with Instruction Backtranslation. InstructionBacktranslation -
PrometheusEval Critique and rank the quality of generations from an LLM using Prometheus 2.0. PrometheusEval -
ComplexityScorer Score instructions based on their complexity using an LLM . ComplexityScorer -
QualityScorer Score responses based on their quality using an LLM . QualityScorer -
CLAIR Contrastive Learning from AI Revisions (CLAIR). CLAIR -
UltraFeedback Rank generations focusing on different aspects using an LLM . UltraFeedback -
PairRM Rank the candidates based on the input using the LLM model. PairRM -
GenerateSentencePair Generate a positive and negative (optionally) sentences given an anchor sentence. GenerateSentencePair -
GenerateEmbeddings Generate embeddings using the last hidden state of an LLM . GenerateEmbeddings -
TextClustering Task that clusters a set of texts and generates summary labels for each cluster. TextClustering -
TextClustering Task that clusters a set of texts and generates summary labels for each cluster. TextClustering -
APIGenSemanticChecker Generate queries and answers for the given functions in JSON format. APIGenSemanticChecker -
GenerateTextRetrievalData Generate text retrieval data with an LLM to later on train an embedding model. GenerateTextRetrievalData -
GenerateShortTextMatchingData Generate short text matching data with an LLM to later on train an embedding model. GenerateShortTextMatchingData -
GenerateLongTextMatchingData Generate long text matching data with an LLM to later on train an embedding model. GenerateLongTextMatchingData -
GenerateTextClassificationData Generate text classification data with an LLM to later on train an embedding model. GenerateTextClassificationData -
StructuredGeneration Generate structured content for a given instruction using an LLM . StructuredGeneration -
MonolingualTripletGenerator Generate monolingual triplets with an LLM to later on train an embedding model. MonolingualTripletGenerator -
BitextRetrievalGenerator Generate bitext retrieval data with an LLM to later on train an embedding model. BitextRetrievalGenerator -
EmbeddingTaskGenerator Generate task descriptions for embedding-related tasks using an LLM . EmbeddingTaskGenerator "},{"location":"components-gallery/tasks/apigengenerator/","title":"APIGenGenerator","text":"Generate queries and answers for the given functions in JSON format. The APIGenGenerator is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task generates a set of diverse queries and corresponding answers for the given functions in JSON format. "},{"location":"components-gallery/tasks/apigengenerator/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to guide the user in the generation of queries and answers. -
use_tools: Whether to use the tools available in the prompt to generate the queries and answers. In case the tools are given in the input, they will be added to the prompt. -
number: The number of queries to generate. It can be a list, where each number will be chosen randomly, or a dictionary with the number of queries and the probability of each. For example, number=1 , number=[1, 2, 3] , and number={1: 0.5, 2: 0.3, 3: 0.2} are all valid inputs. It corresponds to the number of parallel queries to generate. -
use_default_structured_output: Whether to use the default structured output or not. "},{"location":"components-gallery/tasks/apigengenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[examples]\n ICOL1[func_name]\n ICOL2[func_desc]\n ICOL3[tools]\n end\n subgraph New columns\n OCOL0[query]\n OCOL1[answers]\n end\n end\n\n subgraph APIGenGenerator\n StepInput[Input Columns: examples, func_name, func_desc, tools]\n StepOutput[Output Columns: query, answers]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/apigengenerator/#inputs","title":"Inputs","text":" -
examples (str ): Examples used as few shots to guide the model. -
func_name (str ): Name for the function to generate. -
func_desc (str ): Description of what the function should do. -
tools (str ): JSON formatted string containing the tool representation of the function. "},{"location":"components-gallery/tasks/apigengenerator/#outputs","title":"Outputs","text":" -
query (str ): The list of queries. -
answers (str ): JSON formatted string with the list of answers, containing the info as a dictionary to be passed to the functions. "},{"location":"components-gallery/tasks/apigengenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/apigengenerator/#generate-without-structured-output-original-implementation","title":"Generate without structured output (original implementation)","text":"from distilabel.steps.tasks import ApiGenGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\napigen = ApiGenGenerator(\n use_default_structured_output=False,\n llm=llm\n)\napigen.load()\n\nres = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n \"func_name\": \"getrandommovie\",\n \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n }\n ]\n )\n)\nres\n# [{'examples': 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n# 'number': 1,\n# 'func_name': 'getrandommovie',\n# 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n# 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?',\n# 'Give me 5 random movie suggestions from your database to plan my weekend.'],\n# 'answers': [[{'name': 'getrandommovie', 'arguments': {}}],\n# [{'name': 'getrandommovie', 'arguments': {}},\n# {'name': 'getrandommovie', 'arguments': {}},\n# {'name': 'getrandommovie', 'arguments': {}},\n# {'name': 'getrandommovie', 'arguments': {}},\n# {'name': 'getrandommovie', 'arguments': {}}]],\n# 'raw_input_api_gen_generator_0': [{'role': 'system',\n# 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\n\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\n\nEnsure the query:\n- Is clear and concise\n- Demonstrates typical use cases\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\n- The corresponding result's parameter types and ranges match with the function's descriptions\n\nEnsure the answer:\n- Is a list of function calls in JSON format\n- The length of the answer list should be equal to the number of requests in the query\n- Can solve all the requests in the query effectively\"},\n# {'role': 'user',\n# 'content': 'Here are examples of queries and the corresponding answers for similar functions:\nQUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\n\nNote that the query could be interpreted as a combination of several independent requests.\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\nThe detailed function description is the following:\nReturns a list of random movies from a database by calling an external API.\n\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\n "},{"location":"components-gallery/tasks/apigengenerator/#generate-with-structured-output","title":"Generate with structured output","text":"from distilabel.steps.tasks import ApiGenGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\napigen = ApiGenGenerator(\n use_default_structured_output=True,\n llm=llm\n)\napigen.load()\n\nres_struct = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n \"func_name\": \"getrandommovie\",\n \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n }\n ]\n )\n)\nres_struct\n# [{'examples': 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n# 'number': 1,\n# 'func_name': 'getrandommovie',\n# 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n# 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\",\n# \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"],\n# 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}],\n# [{'arguments': {}, 'name': 'getrandommovie'}]],\n# 'raw_input_api_gen_generator_0': [{'role': 'system',\n# 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\n\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\n\nEnsure the query:\n- Is clear and concise\n- Demonstrates typical use cases\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\n- The corresponding result's parameter types and ranges match with the function's descriptions\n\nEnsure the answer:\n- Is a list of function calls in JSON format\n- The length of the answer list should be equal to the number of requests in the query\n- Can solve all the requests in the query effectively\"},\n# {'role': 'user',\n# 'content': 'Here are examples of queries and the corresponding answers for similar functions:\nQUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\n\nNote that the query could be interpreted as a combination of several independent requests.\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\nThe detailed function description is the following:\nReturns a list of random movies from a database by calling an external API.\n\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/apigengenerator/#references","title":"References","text":""},{"location":"components-gallery/tasks/genstruct/","title":"Genstruct","text":"Generate a pair of instruction-response from a document using an LLM . Genstruct is a pre-defined task designed to generate valid instructions from a given raw document, with the title and the content, enabling the creation of new, partially synthetic instruction finetuning datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is inspired in the Ada-Instruct paper. "},{"location":"components-gallery/tasks/genstruct/#note","title":"Note","text":"The Genstruct prompt i.e. the task, can be used with any model really, but the safest / recommended option is to use NousResearch/Genstruct-7B as the LLM provided to the task, since it was trained for this specific task. "},{"location":"components-gallery/tasks/genstruct/#attributes","title":"Attributes","text":" - _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/genstruct/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[title]\n ICOL1[content]\n end\n subgraph New columns\n OCOL0[user]\n OCOL1[assistant]\n OCOL2[model_name]\n end\n end\n\n subgraph Genstruct\n StepInput[Input Columns: title, content]\n StepOutput[Output Columns: user, assistant, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/genstruct/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/genstruct/#outputs","title":"Outputs","text":" -
user (str ): The user's instruction based on the document. -
assistant (str ): The assistant's response based on the user's instruction. -
model_name (str ): The model name used to generate the feedback and result . "},{"location":"components-gallery/tasks/genstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/genstruct/#generate-instructions-from-raw-documents-using-the-title-and-content","title":"Generate instructions from raw documents using the title and content","text":"from distilabel.steps.tasks import Genstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ngenstruct = Genstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"NousResearch/Genstruct-7B\",\n ),\n)\n\ngenstruct.load()\n\nresult = next(\n genstruct.process(\n [\n {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'title': 'An instruction',\n# 'content': 'content of the document',\n# 'model_name': 'test',\n# 'user': 'An instruction',\n# 'assistant': 'content of the document',\n# }\n# ]\n "},{"location":"components-gallery/tasks/genstruct/#references","title":"References","text":""},{"location":"components-gallery/tasks/magpie/","title":"Magpie","text":"Generates conversations using an instruct fine-tuned LLM. Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'. "},{"location":"components-gallery/tasks/magpie/#attributes","title":"Attributes","text":" -
n_turns: the number of turns that the generated conversation will have. Defaults to 1 . -
end_with_user: whether the conversation should end with a user message. Defaults to False . -
include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False . -
only_instruction: whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . -
system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None . "},{"location":"components-gallery/tasks/magpie/#runtime-parameters","title":"Runtime Parameters","text":" -
n_turns: the number of turns that the generated conversation will have. Defaults to 1 . -
end_with_user: whether the conversation should end with a user message. Defaults to False . -
include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False . -
only_instruction: whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . -
system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. "},{"location":"components-gallery/tasks/magpie/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[system_prompt]\n end\n subgraph New columns\n OCOL0[conversation]\n OCOL1[instruction]\n OCOL2[response]\n OCOL3[system_prompt_key]\n OCOL4[model_name]\n end\n end\n\n subgraph Magpie\n StepInput[Input Columns: system_prompt]\n StepOutput[Output Columns: conversation, instruction, response, system_prompt_key, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/magpie/#inputs","title":"Inputs","text":" - system_prompt (
str , optional): an optional system prompt that can be provided to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. "},{"location":"components-gallery/tasks/magpie/#outputs","title":"Outputs","text":" -
conversation (ChatType ): the generated conversation which is a list of chat items with a role and a message. Only if only_instruction=False . -
instruction (str ): the generated instructions if only_instruction=True or n_turns==1 . -
response (str ): the generated response if n_turns==1 . -
system_prompt_key (str , optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary. -
model_name (str ): The model name used to generate the conversation or instruction . "},{"location":"components-gallery/tasks/magpie/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/magpie/#generating-instructions-with-llama-3-8b-instruct-and-transformersllm","title":"Generating instructions with Llama 3 8B Instruct and TransformersLLM","text":"from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n)\n\nmagpie.load()\n\nresult = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n)\n# [\n# {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n# {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n# ]\n "},{"location":"components-gallery/tasks/magpie/#generating-conversations-with-llama-3-8b-instruct-and-transformersllm","title":"Generating conversations with Llama 3 8B Instruct and TransformersLLM","text":"from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n n_turns=2,\n)\n\nmagpie.load()\n\nresult = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n)\n# [\n# {\n# 'conversation': [\n# {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n# {\n# 'role': 'user',\n# 'content': 'I'm having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. It is read as \"the limit as x approaches a of f\n# of x\".'\n# },\n# {\n# 'role': 'assistant',\n# 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don't worry, I'm here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n# x\". What it's asking us to do is find the'\n# }\n# ]\n# },\n# {\n# 'conversation': [\n# {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n# {\n# 'role': 'user',\n# 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? 
Based on your expertise as a florist AI assistant, I think it\n# might be pests or diseases, but I'm not sure which.\"\n# },\n# {\n# 'role': 'assistant',\n# 'content': \"I'd be delighted to help you investigate the issue! Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. Here are a few potential culprits: 1.\n# **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n# }\n# ]\n# }\n# ]\n "},{"location":"components-gallery/tasks/magpie/#references","title":"References","text":" - Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
"},{"location":"components-gallery/tasks/mathshepherdcompleter/","title":"MathShepherdCompleter","text":"Math Shepherd Completer and auto-labeller task. This task is in charge of, given a list of solutions to an instruction, and a golden solution, as reference, generate completions for the solutions, and label them according to the golden solution using the hard estimation method from figure 2 in the reference paper, Eq. 3. The attributes make the task flexible to be used with different types of dataset and LLMs, and allow making use of different fields to modify the system and user prompts for it. Before modifying them, review the current defaults to ensure the completions are generated correctly. "},{"location":"components-gallery/tasks/mathshepherdcompleter/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to be used in the completions. The default one has been checked and generates good completions using Llama 3.1 with 8B and 70B, but it can be modified to adapt it to the model and dataset selected. -
extra_rules: This field can be used to insert extra rules relevant to the type of dataset. For example, in the original paper they used GSM8K and MATH datasets, and this field can be used to insert the rules for the GSM8K dataset. -
few_shots: Few-shot examples to help the model generate the completions; write them in the format of the solutions expected for your dataset. -
N: Number of completions to generate for each step; corresponds to N in the paper. The paper used 8, but it can be adjusted. -
tags: List of tags to be used in the completions, the default ones are [\"+\", \"-\"] as in the paper, where the first is used as a positive label, and the second as a negative one. This can be updated, but it MUST be a list with 2 elements, where the first is the positive one, and the second the negative one. "},{"location":"components-gallery/tasks/mathshepherdcompleter/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[solutions]\n ICOL2[golden_solution]\n end\n subgraph New columns\n OCOL0[solutions]\n OCOL1[model_name]\n end\n end\n\n subgraph MathShepherdCompleter\n StepInput[Input Columns: instruction, solutions, golden_solution]\n StepOutput[Output Columns: solutions, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/mathshepherdcompleter/#inputs","title":"Inputs","text":" -
instruction (str ): The task or instruction. -
solutions (List[str] ): List of solutions to the task. -
golden_solution (str ): The reference solution to the task, will be used to annotate the candidate solutions. "},{"location":"components-gallery/tasks/mathshepherdcompleter/#outputs","title":"Outputs","text":" -
solutions (List[str] ): The same \"solutions\" column provided as input, now modified with the label for each step. -
model_name (str ): The name of the model used to generate the revision. "},{"location":"components-gallery/tasks/mathshepherdcompleter/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/mathshepherdcompleter/#annotate-your-steps-with-the-math-shepherd-completer-using-the-structured-outputs-the-preferred-way","title":"Annotate your steps with the Math Shepherd Completer using the structured outputs (the preferred way)","text":"from distilabel.steps.tasks import MathShepherdCompleter\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdCompleter(\n llm=llm,\n N=3,\n use_default_structured_output=True\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"],\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. 
+\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n "},{"location":"components-gallery/tasks/mathshepherdcompleter/#annotate-your-steps-with-the-math-shepherd-completer","title":"Annotate your steps with the Math Shepherd Completer","text":"from distilabel.steps.tasks import MathShepherdCompleter\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdCompleter(\n llm=llm,\n N=3\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"],\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n "},{"location":"components-gallery/tasks/mathshepherdcompleter/#references","title":"References","text":" - Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations
"},{"location":"components-gallery/tasks/mathshepherdgenerator/","title":"MathShepherdGenerator","text":"Math Shepherd solution generator. This task is in charge of generating completions for a given instruction, in the format expected by the Math Shepherd Completer task. The attributes make the task flexible to be used with different types of dataset and LLMs, but we provide examples for the GSM8K and MATH datasets as presented in the original paper. Before modifying them, review the current defaults to ensure the completions are generated correctly. This task can be used to generate the golden solutions for a given problem if not provided, as well as possible solutions to be then labeled by the Math Shepherd Completer. Only one of solutions or golden_solution will be generated, depending on the value of M. "},{"location":"components-gallery/tasks/mathshepherdgenerator/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to be used in the completions. The default one has been checked and generates good completions using Llama 3.1 with 8B and 70B, but it can be modified to adapt it to the model and dataset selected. Take into account that the system prompt includes 2 variables in the Jinja2 template, {{extra_rules}} and {{few_shot}}. These variables are used to include extra rules, for example to steer the model towards a specific type of responses, and few shots to add examples. They can be modified to adapt the system prompt to the dataset and model used without needing to change the full system prompt. -
extra_rules: This field can be used to insert extra rules relevant to the type of dataset. For example, in the original paper they used GSM8K and MATH datasets, and this field can be used to insert the rules for the GSM8K dataset. -
few_shots: Few-shot examples to help the model generate the completions; write them in the format of the solutions expected for your dataset. -
M: Number of completions to generate for each step. By default it is set to 1, which will generate the \"golden_solution\". In this case, select a stronger model, as it will be used as the source of truth during labelling. If M is set to a number greater than 1, the task will generate a list of completions to be labeled by the Math Shepherd Completer task. "},{"location":"components-gallery/tasks/mathshepherdgenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n end\n subgraph New columns\n OCOL0[golden_solution]\n OCOL1[solutions]\n OCOL2[model_name]\n end\n end\n\n subgraph MathShepherdGenerator\n StepInput[Input Columns: instruction]\n StepOutput[Output Columns: golden_solution, solutions, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/mathshepherdgenerator/#inputs","title":"Inputs","text":" - instruction (
str ): The task or instruction. "},{"location":"components-gallery/tasks/mathshepherdgenerator/#outputs","title":"Outputs","text":" -
golden_solution (str ): The step by step solution to the instruction. It will be generated if M is equal to 1. -
solutions (List[List[str]] ): A list of possible solutions to the instruction. It will be generated if M is greater than 1. -
model_name (str ): The name of the model used to generate the revision. "},{"location":"components-gallery/tasks/mathshepherdgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/mathshepherdgenerator/#generate-the-solution-for-a-given-instruction-prefer-a-stronger-model-here","title":"Generate the solution for a given instruction (prefer a stronger model here)","text":"from distilabel.steps.tasks import MathShepherdGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdGenerator(\n name=\"golden_solution_generator\",\n llm=llm,\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': '[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"]'}]]\n "},{"location":"components-gallery/tasks/mathshepherdgenerator/#generate-m-completions-for-a-given-instruction-using-structured-output-generation","title":"Generate M completions for a given instruction (using structured output generation)","text":"from distilabel.steps.tasks import MathShepherdGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 2048,\n },\n)\ntask = MathShepherdGenerator(\n name=\"solution_generator\",\n llm=llm,\n M=2,\n use_default_structured_output=True,\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. 
+\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n "},{"location":"components-gallery/tasks/mathshepherdgenerator/#references","title":"References","text":" - Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations
"},{"location":"components-gallery/tasks/selfinstruct/","title":"SelfInstruct","text":"Generate instructions based on a given input using an LLM . SelfInstruct is a pre-defined task that, given a number of instructions, a certain criteria for query generations, an application description, and an input, generates a number of instruction related to the given input and following what is stated in the criteria for query generation and the application description. It is based in the SelfInstruct framework from the paper \"Self-Instruct: Aligning Language Models with Self-Generated Instructions\". "},{"location":"components-gallery/tasks/selfinstruct/#attributes","title":"Attributes","text":" -
num_instructions: The number of instructions to be generated. Defaults to 5. -
criteria_for_query_generation: The criteria for the query generation. Defaults to the criteria defined within the paper. -
application_description: The description of the AI application that one wants to build with these instructions. Defaults to AI assistant . "},{"location":"components-gallery/tasks/selfinstruct/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[input]\n end\n subgraph New columns\n OCOL0[instructions]\n OCOL1[model_name]\n end\n end\n\n subgraph SelfInstruct\n StepInput[Input Columns: input]\n StepOutput[Output Columns: instructions, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/selfinstruct/#inputs","title":"Inputs","text":" - input (
str ): The input to generate the instructions. It's also called seed in the paper. "},{"location":"components-gallery/tasks/selfinstruct/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/selfinstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/selfinstruct/#generate-instructions-based-on-a-given-input","title":"Generate instructions based on a given input","text":"from distilabel.steps.tasks import SelfInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\nself_instruct = SelfInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=5, # This is the default value\n)\n\nself_instruct.load()\n\nresult = next(self_instruct.process([{\"input\": \"instruction\"}]))\n# result\n# [\n# {\n# 'input': 'instruction',\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# 'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n# }\n# ]\n "},{"location":"components-gallery/tasks/textgeneration/","title":"TextGeneration","text":"Text generation with an LLM given a prompt. TextGeneration is a pre-defined task that allows passing a custom prompt using the Jinja2 syntax. By default, an instruction is expected in the inputs, but using the template and columns attributes one can define a custom prompt and the columns expected from the text. This task should be good enough for tasks that don't need post-processing of the responses generated by the LLM. "},{"location":"components-gallery/tasks/textgeneration/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to use in the generation. If not provided, then it will check if the input row has a column named system_prompt and use it. If not, then no system prompt will be used. Defaults to None . -
template: The template to use for the generation. It must follow the Jinja2 template syntax. If not provided, it will assume the text passed is an instruction and construct the appropriate template. -
columns: A string with the column, or a list with columns expected in the template. Take a look at the examples for more information. Defaults to instruction . -
use_system_prompt: DEPRECATED. To be removed in 1.5.0. Whether to use the system prompt in the generation. Defaults to True , which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored. "},{"location":"components-gallery/tasks/textgeneration/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[generation]\n OCOL1[model_name]\n end\n end\n\n subgraph TextGeneration\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: generation, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/textgeneration/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): By default will be set to instruction . The columns can point both to a str or a List[str] to be used in the template. "},{"location":"components-gallery/tasks/textgeneration/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/textgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textgeneration/#generate-text-from-an-instruction","title":"Generate text from an instruction","text":"from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n )\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [{\"instruction\": \"your instruction\"}]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'your instruction',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'generation',\n# }\n# ]\n "},{"location":"components-gallery/tasks/textgeneration/#use-a-custom-template-to-generate-text","title":"Use a custom template to generate text","text":"from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Document:\n{{ document }}\n\nQuestion: {{ question }}\n\nPlease provide a clear and concise answer to the question based on the information in the document and your general knowledge:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n template=CUSTOM_TEMPLATE,\n columns=[\"document\", \"question\"],\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [\n {\n \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n# 'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n# }\n# ]\n "},{"location":"components-gallery/tasks/textgeneration/#few-shot-learning-with-different-system-prompts","title":"Few shot learning with different system prompts","text":"from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n{% for example in examples %}\nExample {{ loop.index }}:\nInstruction: {{ example }}\n\n{% endfor %}\nNow, generate a new instruction in a similar style:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n template=CUSTOM_TEMPLATE,\n columns=\"examples\",\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [\n {\n \"examples\": [\"This is an example\", \"Another relevant example\"],\n \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'examples': ['This is an example', 'Another relevant example'],\n# 'system_prompt': 'You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'Disable the firewall on the router',\n# }\n# ]\n "},{"location":"components-gallery/tasks/textgeneration/#references","title":"References","text":" - Jinja2 Template Designer Documentation
"},{"location":"components-gallery/tasks/textgenerationwithimage/","title":"TextGenerationWithImage","text":"Text generation with images with an LLM given a prompt. TextGenerationWithImage is a pre-defined task that allows passing a custom prompt using the Jinja2 syntax. By default, a instruction is expected in the inputs, but the using template and columns attributes one can define a custom prompt and columns expected from the text. Additionally, an image column is expected containing one of the url, base64 encoded image or PIL image. This task inherits from TextGeneration , so all the functionality available in that task related to the prompt will be available here too. "},{"location":"components-gallery/tasks/textgenerationwithimage/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to use in the generation. If not provided, then no system prompt will be used. Defaults to None . -
template: The template to use for the generation. It must follow the Jinja2 template syntax. If not provided, it will assume the text passed is an instruction and construct the appropriate template. -
columns: A string with the column, or a list with columns expected in the template. Take a look at the examples for more information. Defaults to instruction . -
image_type: The type of the image provided, this will be used to preprocess if necessary. Must be one of \"url\", \"base64\" or \"PIL\". "},{"location":"components-gallery/tasks/textgenerationwithimage/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[generation]\n OCOL1[model_name]\n end\n end\n\n subgraph TextGenerationWithImage\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: generation, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/textgenerationwithimage/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): By default will be set to instruction . The columns can point both to a str or a list[str] to be used in the template. "},{"location":"components-gallery/tasks/textgenerationwithimage/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/textgenerationwithimage/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textgenerationwithimage/#answer-questions-from-an-image","title":"Answer questions from an image","text":"from distilabel.steps.tasks import TextGenerationWithImage\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nvision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"url\"\n)\n\nvision.load()\n\nresult = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n }\n ]\n )\n)\n# result\n# [\n# {\n# \"instruction\": \"What\u2019s in this image?\",\n# \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n# \"generation\": \"Based on the visual cues in the image...\",\n# \"model_name\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\"\n# ... # distilabel_metadata would be here\n# }\n# ]\n# result[0][\"generation\"]\n# \"Based on the visual cues in the image, here are some possible story points:\n\n* The image features a wooden boardwalk leading through a lush grass field, possibly in a park or nature reserve.\n\nAnalysis and Ideas:\n* The abundance of green grass and trees suggests a healthy ecosystem or habitat.\n* The presence of wildlife, such as birds or deer, is possible based on the surroundings.\n* A footbridge or a pathway might be a common feature in this area, providing access to nearby attractions or points of interest.\n\nAdditional Questions to Ask:\n* Why is a footbridge present in this area?\n* What kind of wildlife inhabits this region\"\n "},{"location":"components-gallery/tasks/textgenerationwithimage/#answer-questions-from-an-image-stored-as-base64","title":"Answer questions from an image stored as base64","text":"# For this example we will assume that we have the string representation of the image\n# stored, but will just take the image and transform it to base64 to ilustrate the example.\nimport requests\nimport base64\n\nimage_url =\"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\nimg = requests.get(image_url).content\nbase64_image = base64.b64encode(img).decode(\"utf-8\")\n\nfrom distilabel.steps.tasks import TextGenerationWithImage\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nvision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"base64\"\n)\n\nvision.load()\n\nresult = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": base64_image\n }\n ]\n )\n)\n "},{"location":"components-gallery/tasks/textgenerationwithimage/#references","title":"References","text":""},{"location":"components-gallery/tasks/urial/","title":"URIAL","text":"Generates a response using a non-instruct fine-tuned model. 
URIAL is a pre-defined task that generates a response using a non-instruct fine-tuned model. This task is used to generate a response based on the conversation provided as input. "},{"location":"components-gallery/tasks/urial/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[conversation]\n end\n subgraph New columns\n OCOL0[generation]\n OCOL1[model_name]\n end\n end\n\n subgraph URIAL\n StepInput[Input Columns: instruction, conversation]\n StepOutput[Output Columns: generation, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/urial/#inputs","title":"Inputs","text":" -
instruction (str , optional): The instruction to generate a response from. -
conversation (List[Dict[str, str]] , optional): The conversation to generate a response from (the last message must be from the user). "},{"location":"components-gallery/tasks/urial/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/urial/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/urial/#generate-text-from-an-instruction","title":"Generate text from an instruction","text":"from distilabel.models import vLLM\nfrom distilabel.steps.tasks import URIAL\n\nstep = URIAL(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3.1-8B\",\n generation_kwargs={\"temperature\": 0.7},\n ),\n)\n\nstep.load()\n\nresults = next(\n step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n)\n# [\n# {\n# 'instruction': \"What's the most most common type of cloud?\",\n# 'generation': 'Clouds are classified into three main types, high, middle, and low. The most common type of cloud is the middle cloud.',\n# 'distilabel_metadata': {...},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n# }\n# ]\n "},{"location":"components-gallery/tasks/urial/#references","title":"References","text":" - The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning
"},{"location":"components-gallery/tasks/magpiegenerator/","title":"MagpieGenerator","text":"Generator task the generates instructions or conversations using Magpie. Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'. "},{"location":"components-gallery/tasks/magpiegenerator/#attributes","title":"Attributes","text":" -
n_turns: the number of turns that the generated conversation will have. Defaults to 1 . -
end_with_user: whether the conversation should end with a user message. Defaults to False . -
include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False . -
only_instruction: whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . -
system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None . -
num_rows: the number of rows to be generated. "},{"location":"components-gallery/tasks/magpiegenerator/#runtime-parameters","title":"Runtime Parameters","text":" -
n_turns: the number of turns that the generated conversation will have. Defaults to 1 . -
end_with_user: whether the conversation should end with a user message. Defaults to False . -
include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False . -
only_instruction: whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . -
system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. -
num_rows: the number of rows to be generated. "},{"location":"components-gallery/tasks/magpiegenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[conversation]\n OCOL1[instruction]\n OCOL2[response]\n OCOL3[system_prompt_key]\n OCOL4[model_name]\n end\n end\n\n subgraph MagpieGenerator\n StepOutput[Output Columns: conversation, instruction, response, system_prompt_key, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n "},{"location":"components-gallery/tasks/magpiegenerator/#outputs","title":"Outputs","text":" -
conversation (ChatType ): the generated conversation which is a list of chat items with a role and a message. -
instruction (str ): the generated instructions if only_instruction=True . -
response (str ): the generated response if n_turns==1 . -
system_prompt_key (str , optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary. -
model_name (str ): The model name used to generate the conversation or instruction . "},{"location":"components-gallery/tasks/magpiegenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/magpiegenerator/#generating-instructions-with-llama-3-8b-instruct-and-transformersllm","title":"Generating instructions with Llama 3 8B Instruct and TransformersLLM","text":"from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n# [\n# {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n# {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n# ],\n# True\n# )\n "},{"location":"components-gallery/tasks/magpiegenerator/#generating-a-conversation-with-llama-3-8b-instruct-and-transformersllm","title":"Generating a conversation with Llama 3 8B Instruct and TransformersLLM","text":"from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n n_turns=3,\n num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n# [\n# {\n# 'conversation': [\n# {\n# 'role': 'system',\n# 'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n# },\n# {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n# {\n# 'role': 'assistant',\n# 'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n# let's break down the basics. First, we need to identify your goals and target audience. What do\"\n# },\n# {\n# 'role': 'user',\n# 'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n# expenses you might encounter: 1. Time: As the business owner, you'll likely spend time creating\"\n# },\n# {\n# 'role': 'assistant',\n# 'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n# agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n# }\n# ]\n# },\n# {\n# 'conversation': [\n# {\n# 'role': 'system',\n# 'content': 'You are a helpful Al assistant. 
The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n# },\n# {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n# {\n# 'role': 'assistant',\n# 'content': 'Exciting times ahead! When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n# **Purpose**: How will you use your laptop or computer? For work, gaming, video editing,'\n# },\n# {\n# 'role': 'user',\n# 'content': 'Let me stop you there. Let's explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? For example, if I're primarily using my\n# laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n# },\n# {\n# 'role': 'assistant',\n# 'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n# option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n# }\n# ]\n# }\n# ],\n# True\n# )\n "},{"location":"components-gallery/tasks/magpiegenerator/#generating-with-system-prompts-with-probabilities","title":"Generating with system prompts with probabilities","text":"from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\nmagpie = MagpieGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 0.8,\n \"max_new_tokens\": 256,\n },\n ),\n n_turns=2,\n system_prompt={\n \"math\": (\"You're an expert AI assistant.\", 0.8),\n \"writing\": (\"You're an expert writing assistant.\", 0.2),\n },\n)\n\nmagpie.load()\n\nresult = next(magpie.process())\n "},{"location":"components-gallery/tasks/magpiegenerator/#references","title":"References","text":" - Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
"},{"location":"components-gallery/tasks/chatgeneration/","title":"ChatGeneration","text":"Generates text based on a conversation. ChatGeneration is a pre-defined task that defines the messages as the input and generation as the output. This task is used to generate text based on a conversation. The model_name is also returned as part of the output in order to enhance it. "},{"location":"components-gallery/tasks/chatgeneration/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[messages]\n end\n subgraph New columns\n OCOL0[generation]\n OCOL1[model_name]\n end\n end\n\n subgraph ChatGeneration\n StepInput[Input Columns: messages]\n StepOutput[Output Columns: generation, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/chatgeneration/#inputs","title":"Inputs","text":" - messages (
List[Dict[Literal[\"role\", \"content\"], str]] ): The messages to generate the follow up completion from. "},{"location":"components-gallery/tasks/chatgeneration/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/chatgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/chatgeneration/#generate-text-from-a-conversation-in-openai-chat-format","title":"Generate text from a conversation in OpenAI chat format","text":"from distilabel.steps.tasks import ChatGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nchat = ChatGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nchat.load()\n\nresult = next(\n chat.process(\n [\n {\n \"messages\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n ]\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# 'generation': '4',\n# }\n# ]\n "},{"location":"components-gallery/tasks/argillalabeller/","title":"ArgillaLabeller","text":"Annotate Argilla records based on input fields, example records and question settings. This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM. It uses a system prompt that guides the LLM to understand the input fields, the question type, and the question settings. The task then formats the input data and generates a response based on the question. The response is validated against the question's value model, and the final suggestion is prepared for annotation. "},{"location":"components-gallery/tasks/argillalabeller/#attributes","title":"Attributes","text":" - _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/argillalabeller/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[record]\n ICOL1[fields]\n ICOL2[question]\n ICOL3[example_records]\n ICOL4[guidelines]\n end\n subgraph New columns\n OCOL0[suggestion]\n end\n end\n\n subgraph ArgillaLabeller\n StepInput[Input Columns: record, fields, question, example_records, guidelines]\n StepOutput[Output Columns: suggestion]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n ICOL4 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/argillalabeller/#inputs","title":"Inputs","text":" -
record (argilla.Record ): The record to be annotated. -
fields (Optional[List[Dict[str, Any]]] ): The list of field settings for the input fields. -
question (Optional[Dict[str, Any]] ): The question settings for the question to be answered. -
example_records (Optional[List[Dict[str, Any]]] ): The few shot example records with responses to be used to answer the question. -
guidelines (Optional[str] ): The guidelines for the annotation task. "},{"location":"components-gallery/tasks/argillalabeller/#outputs","title":"Outputs","text":" - suggestion (
Dict[str, Any] ): The final suggestion for annotation. "},{"location":"components-gallery/tasks/argillalabeller/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/argillalabeller/#annotate-a-record-with-the-same-dataset-and-question","title":"Annotate a record with the same dataset and question","text":"import argilla as rg\nfrom argilla import Suggestion\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\npending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\ncompleted_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\npending_records = list(\n dataset.records(\n query=rg.Query(filter=pending_records_filter),\n limit=5,\n )\n)\nexample_records = list(\n dataset.records(\n query=rg.Query(filter=completed_records_filter),\n limit=5,\n )\n)\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n fields=[field],\n question=question,\n example_records=example_records,\n guidelines=dataset.guidelines\n)\nlabeller.load()\n\n# Process the pending records\nresult = next(\n labeller.process(\n [\n {\n \"record\": record\n } for record in pending_records\n ]\n )\n)\n\n# Add the suggestions to the records\nfor record, suggestion in zip(pending_records, result):\n record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated records\ndataset.records.log(pending_records)\n "},{"location":"components-gallery/tasks/argillalabeller/#annotate-a-record-with-alternating-datasets-and-questions","title":"Annotate a record with alternating datasets and questions","text":"import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\nquestion2 = dataset.settings.questions[\"label2\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\nlabeller.load()\n\n# Process the record\nrecord = next(dataset.records())\nresult = next(\n labeller.process(\n [\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question,\n },\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question2,\n }\n ]\n )\n)\n\n# Add the suggestions to the record\nfor suggestion in result:\n record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated record\ndataset.records.log([record])\n "},{"location":"components-gallery/tasks/argillalabeller/#overwrite-default-prompts-and-instructions","title":"Overwrite default prompts and instructions","text":"import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Overwrite default prompts and instructions\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n question_to_label_instruction={\n 
\"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n \"multi_label_selection\": \"Select none, one or multiple labels from the list of provided labels.\",\n \"text\": \"Provide a text response to the question.\",\n \"rating\": \"Provide a rating for the question.\",\n },\n)\nlabeller.load()\n "},{"location":"components-gallery/tasks/argillalabeller/#references","title":"References","text":" - Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets
"},{"location":"components-gallery/tasks/textclassification/","title":"TextClassification","text":"Classifies text into one or more categories or labels. This task can be used for text classification problems, where the goal is to assign one or multiple labels to a given text. It uses structured generation as per the reference paper by default, it can help to generate more concise labels. See section 4.1 in the reference. "},{"location":"components-gallery/tasks/textclassification/#attributes","title":"Attributes","text":" -
system_prompt: A prompt to display to the user before the task starts. Contains a default message to make the model behave like a classifier specialist. -
n: Number of labels to generate. If only 1 is required, it corresponds to a label classification problem; if >1, it will return the \"n\" labels most representative of the text. Defaults to 1. -
context: Context to use when generating the labels. By default contains a generic message, but can be used to customize the context for the task. -
examples: List of few-shot examples to help the model understand the task (see the usage sketch after this entry). -
available_labels: List of available labels to choose from when classifying the text, or a dictionary with the labels and their descriptions. -
default_label: Default label to use when the text is ambiguous or lacks sufficient information for classification. Can be a list in case of multiple labels (n>1). "},{"location":"components-gallery/tasks/textclassification/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[text]\n end\n subgraph New columns\n OCOL0[labels]\n OCOL1[model_name]\n end\n end\n\n subgraph TextClassification\n StepInput[Input Columns: text]\n StepOutput[Output Columns: labels, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/textclassification/#inputs","title":"Inputs","text":" - text (
str ): The reference text we want to obtain labels for. "},{"location":"components-gallery/tasks/textclassification/#outputs","title":"Outputs","text":" -
labels (Union[str, List[str]] ): The label or list of labels for the text. -
model_name (str ): The name of the model used to generate the label/s. "},{"location":"components-gallery/tasks/textclassification/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textclassification/#assigning-a-sentiment-to-a-text","title":"Assigning a sentiment to a text","text":"from distilabel.steps.tasks import TextClassification\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\ntext_classification = TextClassification(\n llm=llm,\n context=\"You are an AI system specialized in assigning sentiment to movies.\",\n available_labels=[\"positive\", \"negative\"],\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n )\n)\n# result\n# [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n# 'labels': 'positive',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"positive\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"positive\", # The text shows positive sentiment\\n \"negative\", # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/textclassification/#assigning-predefined-labels-with-specified-descriptions","title":"Assigning predefined labels with specified descriptions","text":"from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n llm=llm,\n n=1,\n context=\"Determine the intent of the text.\",\n available_labels={\n \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n \"inquiry\": \"A question or request for information about a product, service, or situation. It's a neutral or curious expression seeking clarification or details.\",\n \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. 
It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n },\n query_title=\"Customer Query\",\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"Can you tell me more about your return policy?\"}]\n )\n)\n# result\n# [{'text': 'Can you tell me more about your return policy?',\n# 'labels': 'inquiry',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"inquiry\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"complaint\", # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n \"inquiry\", # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n \"feedback\", # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n \"praise\", # A statement expressing admiration, approval, or appreciation for a product, service, or experience. 
It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/textclassification/#free-multi-label-classification-without-predefined-labels","title":"Free multi label classification without predefined labels","text":"from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n llm=llm,\n n=3,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following type of persona.\"\n ),\n query_title=\"Example of Persona\",\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\"}]\n )\n)\n# result\n# [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n# 'labels': ['Historical Researcher',\n# 'Cultural Specialist',\n# 'Ethnic Studies Expert'],\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/textclassification/#references","title":"References","text":" - Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models
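The examples and default_label attributes described above are not exercised by the snippets in this entry. The following is a minimal sketch of how they can be combined, under the assumption that examples takes a list of strings; the few-shot strings, labels and model id are illustrative, not values taken from the documentation.

```python
from distilabel.steps.tasks import TextClassification
from distilabel.models import InferenceEndpointsLLM

# Placeholder LLM, as in the other snippets of this entry.
llm = InferenceEndpointsLLM(
    model_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
)

text_classification = TextClassification(
    llm=llm,
    context="Classify the sentiment of customer reviews.",
    available_labels=["positive", "negative"],
    # Hypothetical few-shot examples; the exact string format expected here is an assumption.
    examples=[
        "Loved the product, works perfectly.",
        "Broke after two days, very disappointed.",
    ],
    # Label to fall back on when the text is too ambiguous to classify.
    default_label="Unclassified",
)

text_classification.load()

result = next(
    text_classification.process(
        [{"text": "It arrived on time, nothing special."}]
    )
)
# Expected shape (illustrative): [{'text': ..., 'labels': 'positive' | 'negative' | 'Unclassified', 'model_name': ...}]
```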
"},{"location":"components-gallery/tasks/evolinstruct/","title":"EvolInstruct","text":"Evolve instructions using an LLM . WizardLM: Empowering Large Language Models to Follow Complex Instructions "},{"location":"components-gallery/tasks/evolinstruct/#attributes","title":"Attributes","text":" -
num_evolutions: The number of evolutions to be performed. -
store_evolutions: Whether to store all the evolutions or just the last one. Defaults to False . -
generate_answers: Whether to generate answers for the evolved instructions. Defaults to False . -
include_original_instruction: Whether to include the original instruction in the evolved_instructions output column. Defaults to False . -
mutation_templates: The mutation templates to be used for evolving the instructions. Defaults to the ones provided in the utils.py file. -
seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . "},{"location":"components-gallery/tasks/evolinstruct/#runtime-parameters","title":"Runtime Parameters","text":" - seed: The seed to be set for
numpy in order to randomly pick a mutation method. "},{"location":"components-gallery/tasks/evolinstruct/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n end\n subgraph New columns\n OCOL0[evolved_instruction]\n OCOL1[evolved_instructions]\n OCOL2[model_name]\n OCOL3[answer]\n OCOL4[answers]\n end\n end\n\n subgraph EvolInstruct\n StepInput[Input Columns: instruction]\n StepOutput[Output Columns: evolved_instruction, evolved_instructions, model_name, answer, answers]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/evolinstruct/#inputs","title":"Inputs","text":" - instruction (
str ): The instruction to evolve. "},{"location":"components-gallery/tasks/evolinstruct/#outputs","title":"Outputs","text":" -
evolved_instruction (str ): The evolved instruction if store_evolutions=False . -
evolved_instructions (List[str] ): The evolved instructions if store_evolutions=True . -
model_name (str ): The name of the LLM used to evolve the instructions. -
answer (str ): The answer to the evolved instruction if generate_answers=True and store_evolutions=False . -
answers (List[str] ): The answers to the evolved instructions if generate_answers=True and store_evolutions=True . "},{"location":"components-gallery/tasks/evolinstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolinstruct/#evolve-an-instruction-using-an-llm","title":"Evolve an instruction using an LLM","text":"from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n "},{"location":"components-gallery/tasks/evolinstruct/#keep-the-iterations-of-the-evolutions","title":"Keep the iterations of the evolutions","text":"from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n store_evolutions=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'evolved_instructions': ['initial evolution', 'final evolution'],\n# 'model_name': 'model_name'\n# }\n# ]\n "},{"location":"components-gallery/tasks/evolinstruct/#generate-answers-for-the-instructions-in-a-single-step","title":"Generate answers for the instructions in a single step","text":"from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n generate_answers=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'evolved_instruction': 'evolved instruction',\n# 'answer': 'answer to the instruction',\n# 'model_name': 'model_name'\n# }\n# ]\n "},{"location":"components-gallery/tasks/evolinstruct/#references","title":"References","text":""},{"location":"components-gallery/tasks/evolcomplexity/","title":"EvolComplexity","text":"Evolve instructions to make them more complex using an LLM . EvolComplexity is a task that evolves instructions to make them more complex, and it is based in the EvolInstruct task, using slight different prompts, but the exact same evolutionary approach. "},{"location":"components-gallery/tasks/evolcomplexity/#attributes","title":"Attributes","text":" -
num_instructions: The number of instructions to be generated. -
generate_answers: Whether to generate answers for the instructions or not. Defaults to False . -
mutation_templates: The mutation templates to be used for the generation of the instructions. -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . -
seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . "},{"location":"components-gallery/tasks/evolcomplexity/#runtime-parameters","title":"Runtime Parameters","text":" -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. -
seed: The seed to be set for numpy in order to randomly pick a mutation method. "},{"location":"components-gallery/tasks/evolcomplexity/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n end\n subgraph New columns\n OCOL0[evolved_instruction]\n OCOL1[answer]\n OCOL2[model_name]\n end\n end\n\n subgraph EvolComplexity\n StepInput[Input Columns: instruction]\n StepOutput[Output Columns: evolved_instruction, answer, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/evolcomplexity/#inputs","title":"Inputs","text":" - instruction (
str ): The instruction to evolve. "},{"location":"components-gallery/tasks/evolcomplexity/#outputs","title":"Outputs","text":" -
evolved_instruction (str ): The evolved instruction. -
answer (str , optional): The answer to the instruction if generate_answers=True . -
model_name (str ): The name of the LLM used to evolve the instructions. "},{"location":"components-gallery/tasks/evolcomplexity/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolcomplexity/#evolve-an-instruction-using-an-llm","title":"Evolve an instruction using an LLM","text":"from distilabel.steps.tasks import EvolComplexity\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity = EvolComplexity(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_complexity.load()\n\nresult = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n "},{"location":"components-gallery/tasks/evolcomplexity/#references","title":"References","text":""},{"location":"components-gallery/tasks/evolquality/","title":"EvolQuality","text":"Evolve the quality of the responses using an LLM . EvolQuality task is used to evolve the quality of the responses given a prompt, by generating a new response with a language model. This step implements the evolution quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. "},{"location":"components-gallery/tasks/evolquality/#attributes","title":"Attributes","text":" -
num_evolutions: The number of evolutions to be performed on the responses. -
store_evolutions: Whether to store all the evolved responses or just the last one. Defaults to False (see the sketch after this entry). -
include_original_response: Whether to include the original response within the evolved responses. Defaults to False . -
mutation_templates: The mutation templates to be used to evolve the responses. -
seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . "},{"location":"components-gallery/tasks/evolquality/#runtime-parameters","title":"Runtime Parameters","text":" - seed: The seed to be set for
numpy in order to randomly pick a mutation method. "},{"location":"components-gallery/tasks/evolquality/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[response]\n end\n subgraph New columns\n OCOL0[evolved_response]\n OCOL1[evolved_responses]\n OCOL2[model_name]\n end\n end\n\n subgraph EvolQuality\n StepInput[Input Columns: instruction, response]\n StepOutput[Output Columns: evolved_response, evolved_responses, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/evolquality/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/evolquality/#outputs","title":"Outputs","text":" -
evolved_response (str ): The evolved response if store_evolutions=False . -
evolved_responses (List[str] ): The evolved responses if store_evolutions=True . -
model_name (str ): The name of the LLM used to evolve the responses. "},{"location":"components-gallery/tasks/evolquality/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolquality/#evolve-the-quality-of-the-responses-given-a-prompt","title":"Evolve the quality of the responses given a prompt","text":"from distilabel.steps.tasks import EvolQuality\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_quality = EvolQuality(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_quality.load()\n\nresult = next(\n evol_quality.process(\n [\n {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'response': 'a response',\n# 'evolved_response': 'evolved response',\n# 'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n# }\n# ]\n "},{"location":"components-gallery/tasks/evolquality/#references","title":"References","text":" - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
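The store_evolutions and include_original_response attributes listed above are not shown in the example of this entry. Below is a minimal sketch combining them; the instruction/response values are placeholders, and the comment on the output shape follows the column descriptions above rather than an actual run.

```python
from distilabel.steps.tasks import EvolQuality
from distilabel.models import InferenceEndpointsLLM

# Placeholder LLM, as in the other examples of this entry.
evol_quality = EvolQuality(
    llm=InferenceEndpointsLLM(
        model_id="mistralai/Mistral-7B-Instruct-v0.2",
    ),
    num_evolutions=2,
    # Keep every evolved response instead of only the last one.
    store_evolutions=True,
    # Also include the original response within the evolved responses.
    include_original_response=True,
)

evol_quality.load()

result = next(
    evol_quality.process(
        [{"instruction": "common instruction", "response": "a response"}]
    )
)
# With store_evolutions=True the output column is `evolved_responses` (a list)
# rather than a single `evolved_response` string.
```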
"},{"location":"components-gallery/tasks/evolinstructgenerator/","title":"EvolInstructGenerator","text":"Generate evolved instructions using an LLM . WizardLM: Empowering Large Language Models to Follow Complex Instructions "},{"location":"components-gallery/tasks/evolinstructgenerator/#attributes","title":"Attributes","text":" -
num_instructions: The number of instructions to be generated. -
generate_answers: Whether to generate answers for the instructions or not. Defaults to False . -
mutation_templates: The mutation templates to be used for the generation of the instructions. -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . -
seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . "},{"location":"components-gallery/tasks/evolinstructgenerator/#runtime-parameters","title":"Runtime Parameters","text":" -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. -
seed: The seed to be set for numpy in order to randomly pick a mutation method. "},{"location":"components-gallery/tasks/evolinstructgenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[instruction]\n OCOL1[answer]\n OCOL2[instructions]\n OCOL3[model_name]\n end\n end\n\n subgraph EvolInstructGenerator\n StepOutput[Output Columns: instruction, answer, instructions, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n "},{"location":"components-gallery/tasks/evolinstructgenerator/#outputs","title":"Outputs","text":" -
instruction (str ): The generated instruction if generate_answers=False . -
answer (str ): The generated answer if generate_answers=True . -
instructions (List[str] ): The generated instructions if generate_answers=True . -
model_name (str ): The name of the LLM used to generate and evolve the instructions. "},{"location":"components-gallery/tasks/evolinstructgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolinstructgenerator/#generate-evolved-instructions-without-initial-instructions","title":"Generate evolved instructions without initial instructions","text":"from distilabel.steps.tasks import EvolInstructGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct_generator = EvolInstructGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=2,\n)\n\nevol_instruct_generator.load()\n\nresult = next(evol_instruct_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n "},{"location":"components-gallery/tasks/evolinstructgenerator/#references","title":"References","text":""},{"location":"components-gallery/tasks/evolcomplexitygenerator/","title":"EvolComplexityGenerator","text":"Generate evolved instructions with increased complexity using an LLM . EvolComplexityGenerator is a generation task that evolves instructions to make them more complex. It is based on the EvolInstruct task, using slightly different prompts but the exact same evolutionary approach. "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#attributes","title":"Attributes","text":" -
num_instructions: The number of instructions to be generated. -
generate_answers: Whether to generate answers for the instructions or not. Defaults to False . -
mutation_templates: The mutation templates to be used for the generation of the instructions. -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . -
seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#runtime-parameters","title":"Runtime Parameters","text":" -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. -
seed: The seed to be set for numpy in order to randomly pick a mutation method. "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[instruction]\n OCOL1[answer]\n OCOL2[model_name]\n end\n end\n\n subgraph EvolComplexityGenerator\n StepOutput[Output Columns: instruction, answer, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#outputs","title":"Outputs","text":" -
instruction (str ): The evolved instruction. -
answer (str , optional): The answer to the instruction if generate_answers=True . -
model_name (str ): The name of the LLM used to evolve the instructions. "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolcomplexitygenerator/#generate-evolved-instructions-without-initial-instructions","title":"Generate evolved instructions without initial instructions","text":"from distilabel.steps.tasks import EvolComplexityGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity_generator = EvolComplexityGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=2,\n)\n\nevol_complexity_generator.load()\n\nresult = next(evol_complexity_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#references","title":"References","text":""},{"location":"components-gallery/tasks/instructionbacktranslation/","title":"InstructionBacktranslation","text":"Self-Alignment with Instruction Backtranslation. "},{"location":"components-gallery/tasks/instructionbacktranslation/#attributes","title":"Attributes","text":" - _template: the Jinja2 template to use for the Instruction Backtranslation task.
"},{"location":"components-gallery/tasks/instructionbacktranslation/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[generation]\n end\n subgraph New columns\n OCOL0[score]\n OCOL1[reason]\n OCOL2[model_name]\n end\n end\n\n subgraph InstructionBacktranslation\n StepInput[Input Columns: instruction, generation]\n StepOutput[Output Columns: score, reason, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/instructionbacktranslation/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/instructionbacktranslation/#outputs","title":"Outputs","text":" -
score (str ): The score for the generation based on the given instruction. -
reason (str ): The reason for the provided score. -
model_name (str ): The model name used to score the generation. "},{"location":"components-gallery/tasks/instructionbacktranslation/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/instructionbacktranslation/#generate-a-score-and-reason-for-a-given-instruction-and-generation","title":"Generate a score and reason for a given instruction and generation","text":"from distilabel.steps.tasks import InstructionBacktranslation\n\ninstruction_backtranslation = InstructionBacktranslation(\n name=\"instruction_backtranslation\",\n llm=llm,\n input_batch_size=10,\n output_mappings={\"model_name\": \"scoring_model\"},\n )\ninstruction_backtranslation.load()\n\nresult = next(\n instruction_backtranslation.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generation\": \"4\",\n }\n ]\n )\n)\n# result\n# [\n# {\n# \"instruction\": \"How much is 2+2?\",\n# \"generation\": \"4\",\n# \"score\": 3,\n# \"reason\": \"Reason for the generation.\",\n# \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n# }\n# ]\n "},{"location":"components-gallery/tasks/instructionbacktranslation/#references","title":"References","text":" - Self-Alignment with Instruction Backtranslation
"},{"location":"components-gallery/tasks/prometheuseval/","title":"PrometheusEval","text":"Critique and rank the quality of generations from an LLM using Prometheus 2.0. PrometheusEval is a task created for Prometheus 2.0, covering both the absolute and relative evaluations. The absolute evaluation i.e. mode=\"absolute\" is used to evaluate a single generation from an LLM for a given instruction. The relative evaluation i.e. mode=\"relative\" is used to evaluate two generations from an LLM for a given instruction. Both evaluations provide the possibility of using a reference answer to compare with or withoug the reference attribute, and both are based on a score rubric that critiques the generation/s based on the following default aspects: helpfulness , harmlessness , honesty , factual-validity , and reasoning , that can be overridden via rubrics , and the selected rubric is set via the attribute rubric . "},{"location":"components-gallery/tasks/prometheuseval/#note","title":"Note","text":"The PrometheusEval task is better suited and intended to be used with any of the Prometheus 2.0 models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0, and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting and quality is not guaranteed if using another model, even though some other models may be able to correctly follow the formatting and generate insightful critiques too. "},{"location":"components-gallery/tasks/prometheuseval/#attributes","title":"Attributes","text":" -
mode: the evaluation mode to use, either absolute or relative . It defines whether the task will evaluate one or two generations. -
rubric: the score rubric to use within the prompt to run the critique based on different aspects. Can be any existing key in the rubrics attribute, which by default means that it can be: helpfulness , harmlessness , honesty , factual-validity , or reasoning . Those will only work if using the default rubrics , otherwise, the provided rubrics should be used. -
rubrics: a dictionary containing the different rubrics to use for the critique, where the keys are the rubric names and the values are the rubric descriptions. The default rubrics are the following: helpfulness , harmlessness , honesty , factual-validity , and reasoning . -
reference: a boolean flag to indicate whether a reference answer / completion will be provided, so that the model critique is based on the comparison with it. It implies that the column reference needs to be provided within the input data in addition to the rest of the inputs. -
_template: a Jinja2 template used to format the input for the LLM. "},{"location":"components-gallery/tasks/prometheuseval/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[generation]\n ICOL2[generations]\n ICOL3[reference]\n end\n subgraph New columns\n OCOL0[feedback]\n OCOL1[result]\n OCOL2[model_name]\n end\n end\n\n subgraph PrometheusEval\n StepInput[Input Columns: instruction, generation, generations, reference]\n StepOutput[Output Columns: feedback, result, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/prometheuseval/#inputs","title":"Inputs","text":" -
instruction (str ): The instruction to use as reference. -
generation (str , optional): The generated text from the given instruction . This column is required if mode=absolute . -
generations (List[str] , optional): The generated texts from the given instruction . It should contain 2 generations only. This column is required if mode=relative . -
reference (str , optional): The reference / golden answer for the instruction , to be used by the LLM for comparison against. "},{"location":"components-gallery/tasks/prometheuseval/#outputs","title":"Outputs","text":" -
feedback (str ): The feedback explaining the result below, as critiqued by the LLM using the pre-defined score rubric, compared against reference if provided. -
result (Union[int, Literal[\"A\", \"B\"]] ): If mode=absolute , then the result contains the score for the generation in a likert-scale from 1-5, otherwise, if mode=relative , then the result contains either \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of generations if result='A' or the index 1 if result='B' . -
model_name (str ): The model name used to generate the feedback and result . "},{"location":"components-gallery/tasks/prometheuseval/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/prometheuseval/#critique-and-evaluate-llm-generation-quality-using-prometheus-2_0","title":"Critique and evaluate LLM generation quality using Prometheus 2_0","text":"from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"factual-validity\"\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generation\": \"something done\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n "},{"location":"components-gallery/tasks/prometheuseval/#critique-for-relative-evaluation","title":"Critique for relative evaluation","text":"from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"relative\",\n rubric=\"honesty\"\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generations': ['something done', 'other thing'],\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 'something done',\n# }\n# ]\n "},{"location":"components-gallery/tasks/prometheuseval/#critique-with-a-custom-rubric","title":"Critique with a custom rubric","text":"from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"custom\",\n rubrics={\n \"custom\": \"[A]\\nScore 1: A\\nScore 2: B\\nScore 3: C\\nScore 4: D\\nScore 5: E\"\n }\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generation\": \"something done\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n "},{"location":"components-gallery/tasks/prometheuseval/#critique-using-a-reference-answer","title":"Critique using a reference answer","text":"from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n 
rubric=\"helpfulness\",\n reference=True,\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\n \"instruction\": \"make something\",\n \"generation\": \"something done\",\n \"reference\": \"this is a reference answer\",\n },\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'reference': 'this is a reference answer',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n "},{"location":"components-gallery/tasks/prometheuseval/#references","title":"References","text":""},{"location":"components-gallery/tasks/complexityscorer/","title":"ComplexityScorer","text":"Score instructions based on their complexity using an LLM . ComplexityScorer is a pre-defined task used to rank a list of instructions based in their complexity. It's an implementation of the complexity score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. "},{"location":"components-gallery/tasks/complexityscorer/#attributes","title":"Attributes","text":" - _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/complexityscorer/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instructions]\n end\n subgraph New columns\n OCOL0[scores]\n OCOL1[model_name]\n end\n end\n\n subgraph ComplexityScorer\n StepInput[Input Columns: instructions]\n StepOutput[Output Columns: scores, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/complexityscorer/#inputs","title":"Inputs","text":" - instructions (
List[str] ): The list of instructions to be scored. "},{"location":"components-gallery/tasks/complexityscorer/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/complexityscorer/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/complexityscorer/#evaluate-the-complexity-of-your-instructions","title":"Evaluate the complexity of your instructions","text":"from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n    )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n "},{"location":"components-gallery/tasks/complexityscorer/#generate-structured-output-with-default-schema","title":"Generate structured output with default schema","text":"from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n    )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\n \"scores\": [\\n 1, \\n 2\\n ]\\n}'}}]\n "},{"location":"components-gallery/tasks/complexityscorer/#references","title":"References","text":" 
"},{"location":"components-gallery/tasks/qualityscorer/","title":"QualityScorer","text":"Score responses based on their quality using an LLM . QualityScorer is a pre-defined task that defines the instruction as the input and score as the output. This task is used to rate the quality of instructions and responses. It's an implementation of the quality score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs are scored in terms of quality, obtaining a quality score for each instruction. "},{"location":"components-gallery/tasks/qualityscorer/#attributes","title":"Attributes","text":" - _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/qualityscorer/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[responses]\n end\n subgraph New columns\n OCOL0[scores]\n OCOL1[model_name]\n end\n end\n\n subgraph QualityScorer\n StepInput[Input Columns: instruction, responses]\n StepOutput[Output Columns: scores, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/qualityscorer/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/qualityscorer/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/qualityscorer/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/qualityscorer/#evaluate-the-quality-of-your-instructions","title":"Evaluate the quality of your instructions","text":"from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n)\n# result\n[\n {\n 'instructions': 'instruction',\n 'model_name': 'test',\n 'scores': [5, 3, 1],\n }\n]\n "},{"location":"components-gallery/tasks/qualityscorer/#generate-structured-output-with-default-schema","title":"Generate structured output with default schema","text":"from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\nscorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n)\n\n# result\n[{'instruction': 'instruction',\n'responses': ['good response', 'weird response', 'bad response'],\n'scores': [1, 2, 3],\n'distilabel_metadata': {'raw_output_quality_scorer_0': '{ \"scores\": [1, 2, 3] }'},\n'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/qualityscorer/#references","title":"References","text":" - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/clair/","title":"CLAIR","text":"Contrastive Learning from AI Revisions (CLAIR). CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting preference A preferred A\u2019 is much more contrastive and precise. "},{"location":"components-gallery/tasks/clair/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[task]\n ICOL1[student_solution]\n end\n subgraph New columns\n OCOL0[revision]\n OCOL1[rational]\n OCOL2[model_name]\n end\n end\n\n subgraph CLAIR\n StepInput[Input Columns: task, student_solution]\n StepOutput[Output Columns: revision, rational, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/clair/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/clair/#outputs","title":"Outputs","text":" -
revision (str ): The revised text. -
rational (str ): The rationale for the provided revision. -
model_name (str ): The name of the model used to generate the revision and rational. "},{"location":"components-gallery/tasks/clair/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/clair/#create-contrastive-preference-pairs","title":"Create contrastive preference pairs","text":"from distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096,\n },\n)\nclair_task = CLAIR(llm=llm)\n\nclair_task.load()\n\nresult = next(\n clair_task.process(\n [\n {\n \"task\": \"How many gaps are there between the earth and the moon?\",\n \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n }\n ]\n )\n)\n# result\n# [{'task': 'How many gaps are there between the earth and the moon?',\n# 'student_solution': 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n# 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. 
This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n# 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n# 'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n# {'role': 'user',\n# 'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. 
The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/clair/#references","title":"References","text":""},{"location":"components-gallery/tasks/ultrafeedback/","title":"UltraFeedback","text":"Rank generations focusing on different aspects using an LLM . UltraFeedback: Boosting Language Models with High-quality Feedback. "},{"location":"components-gallery/tasks/ultrafeedback/#attributes","title":"Attributes","text":" - aspect: The aspect to perform with the
UltraFeedback model. The available aspects are: - helpfulness : Evaluate text outputs based on helpfulness. - honesty : Evaluate text outputs based on honesty. - instruction-following : Evaluate text outputs based on given instructions. - truthfulness : Evaluate text outputs based on truthfulness. Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall assessment of the text outputs within a single prompt. The custom aspect is: - overall-rating : Evaluate text outputs based on an overall assessment. Defaults to \"overall-rating\" . "},{"location":"components-gallery/tasks/ultrafeedback/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[generations]\n end\n subgraph New columns\n OCOL0[ratings]\n OCOL1[rationales]\n OCOL2[model_name]\n end\n end\n\n subgraph UltraFeedback\n StepInput[Input Columns: instruction, generations]\n StepOutput[Output Columns: ratings, rationales, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/ultrafeedback/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/ultrafeedback/#outputs","title":"Outputs","text":" -
ratings (List[float] ): The ratings for each of the provided text outputs. -
rationales (List[str] ): The rationales for each of the provided text outputs. -
model_name (str ): The name of the model used to generate the ratings and rationales. "},{"location":"components-gallery/tasks/ultrafeedback/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-selected-aspect","title":"Rate generations from different LLMs based on the selected aspect","text":"from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=False\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [1, 2],\n# 'rationales': ['explanation for 4', 'explanation for and a car'],\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# }\n# ]\n "},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-honesty-using-the-default-structured-output","title":"Rate generations from different LLMs based on the honesty, using the default structured output","text":"from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n aspect=\"honesty\"\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [5, 1],\n# 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',\n# \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\n 5,\\n 1\\n] \\n\\n,\"rationales\": [\\n \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\n \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. 
The model shows no uncertainty or indication that it does not know the answer.\"\\n] }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-helpfulness-using-the-default-structured-output","title":"Rate generations from different LLMs based on the helpfulness, using the default structured output","text":"from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512},\n ),\n aspect=\"helpfulness\"\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [1, 5],\n# 'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',\n# 'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n# 'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n# 'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n# 'types': [1, 3, 1],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\n \"ratings\": [\\n 1,\\n 5\\n ]\\n ,\\n \"rationales\": [\\n \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\n \"Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.\"\\n ]\\n ,\\n \"rationales_for_rating\": [\\n \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\n \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\n ]\\n ,\\n \"types\": [\\n 1, 3,\\n 1\\n ]\\n }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/ultrafeedback/#references","title":"References","text":""},{"location":"components-gallery/tasks/pairrm/","title":"PairRM","text":"Rank the candidates based on the input using the LLM model. "},{"location":"components-gallery/tasks/pairrm/#note","title":"Note","text":"This step differs to other tasks as there is a single implementation of this model currently, and we will use a specific LLM . 
"},{"location":"components-gallery/tasks/pairrm/#attributes","title":"Attributes","text":""},{"location":"components-gallery/tasks/pairrm/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[inputs]\n ICOL1[candidates]\n end\n subgraph New columns\n OCOL0[ranks]\n OCOL1[ranked_candidates]\n OCOL2[model_name]\n end\n end\n\n subgraph PairRM\n StepInput[Input Columns: inputs, candidates]\n StepOutput[Output Columns: ranks, ranked_candidates, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/pairrm/#inputs","title":"Inputs","text":" -
inputs (List[Dict[str, Any]] ): The input text or conversation to rank the candidates for. -
candidates (List[Dict[str, Any]] ): The candidates to rank. "},{"location":"components-gallery/tasks/pairrm/#outputs","title":"Outputs","text":" -
ranks (List[int] ): The ranks of the candidates based on the input. -
ranked_candidates (List[Dict[str, Any]] ): The candidates ranked based on the input. -
model_name (str ): The model name used to rank the candidate responses. Defaults to \"llm-blender/PairRM\" . "},{"location":"components-gallery/tasks/pairrm/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/pairrm/#rank-llm-candidates","title":"Rank LLM candidates","text":"from distilabel.steps.tasks import PairRM\n\n# PairRM uses its own underlying model, so no LLM needs to be provided.\npair_rm = PairRM()\n\npair_rm.load()\n\nresult = next(\n pair_rm.process(\n [\n {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n ]\n )\n)\n# result\n# [\n# {\n# 'input': 'Hello, how are you?',\n# 'candidates': ['fine', 'good', 'bad'],\n# 'ranks': [2, 1, 3],\n# 'ranked_candidates': ['good', 'fine', 'bad'],\n# 'model_name': 'llm-blender/PairRM',\n# }\n# ]\n "},{"location":"components-gallery/tasks/pairrm/#references","title":"References","text":""},{"location":"components-gallery/tasks/generatesentencepair/","title":"GenerateSentencePair","text":"Generate a positive and, optionally, a negative sentence given an anchor sentence. GenerateSentencePair is a pre-defined task that, given an anchor sentence, generates a positive sentence related to the anchor and optionally a negative sentence unrelated to the anchor or similar to it. Optionally, you can give a context to guide the LLM towards more specific behavior. This task is useful for generating training datasets for embedding models. "},{"location":"components-gallery/tasks/generatesentencepair/#attributes","title":"Attributes","text":" -
triplet: a flag to indicate if the task should generate a triplet of sentences (anchor, positive, negative). Defaults to False . -
action: the action to perform to generate the positive sentence. -
context: the context to use for the generation. Can be helpful to guide the LLM towards more specific behavior. Not used by default. -
hard_negative: A flag to indicate if the negative should be a hard-negative or not. Hard negatives make it hard for the model to distinguish against the positive, with a higher degree of semantic similarity. "},{"location":"components-gallery/tasks/generatesentencepair/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[anchor]\n end\n subgraph New columns\n OCOL0[positive]\n OCOL1[negative]\n OCOL2[model_name]\n end\n end\n\n subgraph GenerateSentencePair\n StepInput[Input Columns: anchor]\n StepOutput[Output Columns: positive, negative, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generatesentencepair/#inputs","title":"Inputs","text":" - anchor (
str ): The anchor sentence to generate the positive and negative sentences. "},{"location":"components-gallery/tasks/generatesentencepair/#outputs","title":"Outputs","text":" -
positive (str ): The positive sentence related to the anchor . -
negative (str ): The negative sentence unrelated to the anchor if triplet=True , or a sentence close to the positive (making it more challenging for a model to distinguish) if hard_negative=True . -
model_name (str ): The name of the model that was used to generate the sentences. "},{"location":"components-gallery/tasks/generatesentencepair/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatesentencepair/#paraphrasing","title":"Paraphrasing","text":"from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"paraphrase\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n "},{"location":"components-gallery/tasks/generatesentencepair/#generating-semantically-similar-sentences","title":"Generating semantically similar sentences","text":"from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import GenerateSentencePair\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"semantically-similar\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n "},{"location":"components-gallery/tasks/generatesentencepair/#generating-queries","title":"Generating queries","text":"from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. 
Using Argilla, ...\"}])\n "},{"location":"components-gallery/tasks/generatesentencepair/#generating-answers","title":"Generating answers","text":"from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"answer\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n "},{"location":"components-gallery/tasks/generatesentencepair/#_1","title":")","text":"from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n hard_negative=True,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n use_default_structured_output=True\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n "},{"location":"components-gallery/tasks/generateembeddings/","title":"GenerateEmbeddings","text":"Generate embeddings using the last hidden state of an LLM . Generate embeddings for a text input using the last hidden state of an LLM , as described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. "},{"location":"components-gallery/tasks/generateembeddings/#attributes","title":"Attributes","text":" - llm: The
LLM to use to generate the embeddings. "},{"location":"components-gallery/tasks/generateembeddings/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[text]\n end\n subgraph New columns\n OCOL0[embedding]\n OCOL1[model_name]\n end\n end\n\n subgraph GenerateEmbeddings\n StepInput[Input Columns: text]\n StepOutput[Output Columns: embedding, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generateembeddings/#inputs","title":"Inputs","text":" - text (
str , List[Dict[str, str]] ): The input text or conversation to generate embeddings for. "},{"location":"components-gallery/tasks/generateembeddings/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/generateembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generateembeddings/#rank-llm-candidates","title":"Generate embeddings for a text input","text":"from distilabel.steps.tasks import GenerateEmbeddings\nfrom distilabel.models.llms.huggingface import TransformersLLM\n\n# Consider this as a placeholder for your actual LLM.\nembedder = GenerateEmbeddings(\n llm=TransformersLLM(\n model=\"TaylorAI/bge-micro-v2\",\n model_kwargs={\"is_decoder\": True},\n cuda_devices=[],\n )\n)\nembedder.load()\n\nresult = next(\n embedder.process(\n [\n {\"text\": \"Hello, how are you?\"},\n ]\n )\n)\n "},{"location":"components-gallery/tasks/generateembeddings/#references","title":"References","text":" - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
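As a follow-up sketch for the example above (the inspection code below is an illustrative assumption, not part of the original snippet), each output row is expected to keep the input text and add the embedding and model_name columns shown in the diagram: row = result[0]\n# 'embedding' is expected to be a list of floats and 'model_name' the Transformers model used above\nprint(len(row[\"embedding\"]), row[\"model_name\"])\n 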
"},{"location":"components-gallery/tasks/textclustering/","title":"TextClustering","text":"Task that clusters a set of texts and generates summary labels for each cluster. This is a GlobalTask that inherits from TextClassification , this means that all the attributes from that class are available here. Also, in this case we deal with all the inputs at once, instead of using batches. The input_batch_size is used here to send the examples to the LLM in batches (a subtle difference with the more common Task definitions). The task looks in each cluster for a given number of representative examples (the number is set by the samples_per_cluster attribute), and sends them to the LLM to get a label/s that represent the cluster. The labels are then assigned to each text in the cluster. The clusters and projections used in the step, are assumed to be obtained from the UMAP + DBSCAN steps, but could be generated for similar steps, as long as they represent the same concepts. This step runs a pipeline like the one in this repository: https://github.com/huggingface/text-clustering "},{"location":"components-gallery/tasks/textclustering/#attributes","title":"Attributes","text":" - savefig: Whether to generate and save a figure with the clustering of the texts. - samples_per_cluster: The number of examples to use in the LLM as a sample of the cluster.
"},{"location":"components-gallery/tasks/textclustering/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[text]\n ICOL1[projection]\n ICOL2[cluster_label]\n end\n subgraph New columns\n OCOL0[summary_label]\n OCOL1[model_name]\n end\n end\n\n subgraph TextClustering\n StepInput[Input Columns: text, projection, cluster_label]\n StepOutput[Output Columns: summary_label, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/textclustering/#inputs","title":"Inputs","text":" -
text (str ): The reference text we want to obtain labels for. -
projection (List[float] ): Vector representation of the text to cluster, normally the output from the UMAP step. -
cluster_label (int ): Integer representing the label of a given cluster. -1 means it wasn't clustered. "},{"location":"components-gallery/tasks/textclustering/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/textclustering/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textclustering/#generate-labels-for-a-set-of-texts-using-clustering","title":"Generate labels for a set of texts using clustering","text":"from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import UMAP, DBSCAN, TextClustering\nfrom distilabel.pipeline import Pipeline\n\nds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\nwith Pipeline(name=\"Text clustering dataset\") as pipeline:\n batch_size = 500\n\n ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n umap = UMAP(n_components=2, metric=\"cosine\")\n dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n text_clustering = TextClustering(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n n=3, # 3 labels per example\n query_title=\"Examples of Personas\",\n samples_per_cluster=10,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following types of personas. All the examples of personas must share \"\n \"the same set of labels.\"\n ),\n default_label=\"None\",\n savefig=True,\n input_batch_size=8,\n input_mappings={\"text\": \"persona\"},\n use_default_structured_output=True,\n )\n\n loader >> umap >> dbscan >> text_clustering\n "},{"location":"components-gallery/tasks/textclustering/#references","title":"References","text":" - text-clustering repository
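Note that the clustering example above relies on two imports that are not shown in the snippet. A minimal completion would be the following (the datasets import is standard, while the exact distilabel import path for make_generator_step is an assumption and may vary between versions): from datasets import load_dataset\nfrom distilabel.steps import make_generator_step\n 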
"},{"location":"components-gallery/tasks/apigensemanticchecker/","title":"APIGenSemanticChecker","text":"Generate queries and answers for the given functions in JSON format. The APIGenGenerator is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task generates a set of diverse queries and corresponding answers for the given functions in JSON format. "},{"location":"components-gallery/tasks/apigensemanticchecker/#attributes","title":"Attributes","text":" -
system_prompt: System prompt for the task. Has a default one. -
exclude_failed_execution: Whether to exclude failed executions (won't run on those rows that have a False in keep_row_after_execution_check column, which comes from running APIGenExecutionChecker ). Defaults to True. "},{"location":"components-gallery/tasks/apigensemanticchecker/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[func_desc]\n ICOL1[query]\n ICOL2[answers]\n ICOL3[execution_result]\n end\n subgraph New columns\n OCOL0[thought]\n OCOL1[keep_row_after_semantic_check]\n end\n end\n\n subgraph APIGenSemanticChecker\n StepInput[Input Columns: func_desc, query, answers, execution_result]\n StepOutput[Output Columns: thought, keep_row_after_semantic_check]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/apigensemanticchecker/#inputs","title":"Inputs","text":" -
func_desc (str ): Description of what the function should do. -
query (str ): Instruction from the user. -
answers (str ): JSON encoded list with arguments to be passed to the function/API. Should be loaded using json.loads . -
execution_result (str ): Result of the function/API executed. "},{"location":"components-gallery/tasks/apigensemanticchecker/#outputs","title":"Outputs","text":" -
thought (str ): Reasoning for the output on whether to keep this output or not. -
keep_row_after_semantic_check (bool ): True or False, can be used to filter afterwards. "},{"location":"components-gallery/tasks/apigensemanticchecker/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/apigensemanticchecker/#semantic-checker-for-generated-function-calls-original-implementation","title":"Semantic checker for generated function calls (original implementation)","text":"from distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\nsemantic_checker = APIGenSemanticChecker(\n use_default_structured_output=False,\n llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'thought': '',\n# 'keep_row_after_semantic_check': True,\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n# 'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n# {'role': 'user',\n# 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. 
Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n \"thought\": \"Concisely describe your reasoning here\",\\n \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/apigensemanticchecker/#semantic-checker-for-generated-function-calls-structured-output","title":"Semantic checker for generated function calls (structured output)","text":"from distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\nsemantic_checker = APIGenSemanticChecker(\n use_default_structured_output=True,\n llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'keep_row_after_semantic_check': True,\n# 'thought': '',\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n# 'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. 
The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n# {'role': 'user',\n# 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/apigensemanticchecker/#references","title":"References","text":""},{"location":"components-gallery/tasks/generatetextretrievaldata/","title":"GenerateTextRetrievalData","text":"Generate text retrieval data with an LLM to later on train an embedding model. GenerateTextRetrievalData is a Task that generates text retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/generatetextretrievaldata/#note","title":"Note","text":"Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-retrieval\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-retrieval category. "},{"location":"components-gallery/tasks/generatetextretrievaldata/#attributes","title":"Attributes","text":" -
language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
query_type: The type of query to be generated, which can be extremely long-tail , long-tail , or common . Defaults to None , meaning that it will be randomly sampled. -
query_length: The length of the query to be generated, which can be less than 5 words , 5 to 15 words , or at least 10 words . Defaults to None , meaning that it will be randomly sampled. -
difficulty: The difficulty of the query to be generated, which can be high school , college , or PhD . Defaults to None , meaning that it will be randomly sampled. -
clarity: The clarity of the query to be generated, which can be clear , understandable with some effort , or ambiguous . Defaults to None , meaning that it will be randomly sampled. -
num_words: The number of words in the query to be generated, which can be 50 , 100 , 200 , 300 , 400 , or 500 . Defaults to None , meaning that it will be randomly sampled. -
seed: The random seed to be set in case there's any sampling within the format_input method. "},{"location":"components-gallery/tasks/generatetextretrievaldata/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[task]\n end\n subgraph New columns\n OCOL0[user_query]\n OCOL1[positive_document]\n OCOL2[hard_negative_document]\n OCOL3[model_name]\n end\n end\n\n subgraph GenerateTextRetrievalData\n StepInput[Input Columns: task]\n StepOutput[Output Columns: user_query, positive_document, hard_negative_document, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generatetextretrievaldata/#inputs","title":"Inputs","text":" - task (
str ): The task description to be used in the generation. "},{"location":"components-gallery/tasks/generatetextretrievaldata/#outputs","title":"Outputs","text":" -
user_query (str ): the user query generated by the LLM . -
positive_document (str ): the positive document generated by the LLM . -
hard_negative_document (str ): the hard negative document generated by the LLM . -
model_name (str ): the name of the model used to generate the text retrieval data. "},{"location":"components-gallery/tasks/generatetextretrievaldata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatetextretrievaldata/#generate-synthetic-text-retrieval-data-for-training-embedding-models","title":"Generate synthetic text retrieval data for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-retrieval\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextRetrievalData(\n language=\"English\",\n query_type=\"common\",\n query_length=\"5 to 15 words\",\n difficulty=\"high school\",\n clarity=\"clear\",\n num_words=100,\n llm=..., # LLM instance\n )\n\n task >> generate\n "},{"location":"components-gallery/tasks/generatetextretrievaldata/#references","title":"References","text":" - Improving Text Embeddings with Large Language Models
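In the example pipeline above, llm=... is a placeholder. As one possible sketch (the model id is only an example borrowed from other pages of this gallery, and valid Inference Endpoints credentials are assumed), each placeholder could be replaced with an instance such as: from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n 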
"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/","title":"GenerateShortTextMatchingData","text":"Generate short text matching data with an LLM to later on train an embedding model. GenerateShortTextMatchingData is a Task that generates short text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#note","title":"Note","text":"Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-short\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-short category. "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#attributes","title":"Attributes","text":" -
language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
seed: The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params. "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[task]\n end\n subgraph New columns\n OCOL0[input]\n OCOL1[positive_document]\n OCOL2[model_name]\n end\n end\n\n subgraph GenerateShortTextMatchingData\n StepInput[Input Columns: task]\n StepOutput[Output Columns: input, positive_document, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#inputs","title":"Inputs","text":" - task (
str ): The task description to be used in the generation. "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#outputs","title":"Outputs","text":" -
input (str ): the input generated by the LLM . -
positive_document (str ): the positive document generated by the LLM . -
model_name (str ): the name of the model used to generate the short text matching data. "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#generate-synthetic-short-text-matching-data-for-training-embedding-models","title":"Generate synthetic short text matching data for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateShortTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-short\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateShortTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#references","title":"References","text":" - Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/","title":"GenerateLongTextMatchingData","text":"Generate long text matching data with an LLM to later on train an embedding model. GenerateLongTextMatchingData is a Task that generates long text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#note","title":"Note","text":"Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-long\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-long category. "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#attributes","title":"Attributes","text":" -
language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
seed: The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params. "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[task]\n end\n subgraph New columns\n OCOL0[input]\n OCOL1[positive_document]\n OCOL2[model_name]\n end\n end\n\n subgraph GenerateLongTextMatchingData\n StepInput[Input Columns: task]\n StepOutput[Output Columns: input, positive_document, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#inputs","title":"Inputs","text":" - task (
str ): The task description to be used in the generation. "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#outputs","title":"Outputs","text":" -
input (str ): the input generated by the LLM . -
positive_document (str ): the positive document generated by the LLM . -
model_name (str ): the name of the model used to generate the long text matching data. "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#generate-synthetic-long-text-matching-data-for-training-embedding-models","title":"Generate synthetic long text matching data for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateLongTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-long\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateLongTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#references","title":"References","text":" - Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/generatetextclassificationdata/","title":"GenerateTextClassificationData","text":"Generate text classification data with an LLM to later on train an embedding model. GenerateTextClassificationData is a Task that generates text classification data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/generatetextclassificationdata/#note","title":"Note","text":"Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-classification\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-classification category. "},{"location":"components-gallery/tasks/generatetextclassificationdata/#attributes","title":"Attributes","text":" -
language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
difficulty: The difficulty of the query to be generated, which can be high school , college , or PhD . Defaults to None , meaning that it will be randomly sampled. -
clarity: The clarity of the query to be generated, which can be clear , understandable with some effort , or ambiguous . Defaults to None , meaning that it will be randomly sampled. -
seed: The random seed to be set in case there's any sampling within the format_input method. "},{"location":"components-gallery/tasks/generatetextclassificationdata/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[task]\n end\n subgraph New columns\n OCOL0[input_text]\n OCOL1[label]\n OCOL2[misleading_label]\n OCOL3[model_name]\n end\n end\n\n subgraph GenerateTextClassificationData\n StepInput[Input Columns: task]\n StepOutput[Output Columns: input_text, label, misleading_label, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generatetextclassificationdata/#inputs","title":"Inputs","text":" - task (
str ): The task description to be used in the generation. "},{"location":"components-gallery/tasks/generatetextclassificationdata/#outputs","title":"Outputs","text":" -
input_text (str ): the input text generated by the LLM . -
label (str ): the label generated by the LLM . -
misleading_label (str ): the misleading label generated by the LLM . -
model_name (str ): the name of the model used to generate the text classification data. "},{"location":"components-gallery/tasks/generatetextclassificationdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatetextclassificationdata/#generate-synthetic-text-classification-data-for-training-embedding-models","title":"Generate synthetic text classification data for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-classification\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextClassificationData(\n language=\"English\",\n difficulty=\"high school\",\n clarity=\"clear\",\n llm=..., # LLM instance\n )\n\n task >> generate\n "},{"location":"components-gallery/tasks/generatetextclassificationdata/#references","title":"References","text":" - Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/structuredgeneration/","title":"StructuredGeneration","text":"Generate structured content for a given instruction using an LLM . StructuredGeneration is a pre-defined task that defines the instruction and the structured_output as the inputs, and generation as the output. This task is used to generate structured content based on the input instruction and following the schema provided within the structured_output column per each instruction . The model_name also returned as part of the output in order to enhance it. "},{"location":"components-gallery/tasks/structuredgeneration/#attributes","title":"Attributes","text":" - use_system_prompt: Whether to use the system prompt in the generation. Defaults to
True , which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored. "},{"location":"components-gallery/tasks/structuredgeneration/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[structured_output]\n end\n subgraph New columns\n OCOL0[generation]\n OCOL1[model_name]\n end\n end\n\n subgraph StructuredGeneration\n StepInput[Input Columns: instruction, structured_output]\n StepOutput[Output Columns: generation, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/structuredgeneration/#inputs","title":"Inputs","text":" -
instruction (str ): The instruction to generate structured content from. -
structured_output (Dict[str, Any] ): The structured_output to generate structured content from. It should be a Python dictionary with the keys format and schema , where format should be one of json or regex , and the schema should be either the JSON schema or the regex pattern, respectively. "},{"location":"components-gallery/tasks/structuredgeneration/#outputs","title":"Outputs","text":" -
generation (str ): The generated text matching the provided schema, if possible. -
model_name (str ): The name of the model used to generate the text. "},{"location":"components-gallery/tasks/structuredgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/structuredgeneration/#generate-structured-output-from-a-json-schema","title":"Generate structured output from a JSON schema","text":"from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n)\n\nstructured_gen.load()\n\nresult = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"Create an RPG character\",\n \"structured_output\": {\n \"format\": \"json\",\n \"schema\": {\n \"properties\": {\n \"name\": {\n \"title\": \"Name\",\n \"type\": \"string\"\n },\n \"description\": {\n \"title\": \"Description\",\n \"type\": \"string\"\n },\n \"role\": {\n \"title\": \"Role\",\n \"type\": \"string\"\n },\n \"weapon\": {\n \"title\": \"Weapon\",\n \"type\": \"string\"\n }\n },\n \"required\": [\n \"name\",\n \"description\",\n \"role\",\n \"weapon\"\n ],\n \"title\": \"Character\",\n \"type\": \"object\"\n }\n },\n }\n ]\n )\n)\n "},{"location":"components-gallery/tasks/structuredgeneration/#generate-structured-output-from-a-regex-pattern-only-works-with-llms-that-support-regex-the-providers-using-outlines","title":"Generate structured output from a regex pattern (only works with LLMs that support regex, the providers using outlines)","text":"from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n)\n\nstructured_gen.load()\n\nresult = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n \"structured_output\": {\n \"format\": \"regex\",\n \"schema\": r\"(\\d{1,2})\u00b0C\"\n },\n\n }\n ]\n )\n)\n "},{"location":"components-gallery/tasks/monolingualtripletgenerator/","title":"MonolingualTripletGenerator","text":"Generate monolingual triplets with an LLM to later on train an embedding model. MonolingualTripletGenerator is a GeneratorTask that generates monolingual triplets with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/monolingualtripletgenerator/#attributes","title":"Attributes","text":" -
language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
unit: The unit of the data to be generated, which can be sentence , phrase , or passage . Defaults to None , meaning that it will be randomly sampled. -
difficulty: The difficulty of the query to be generated, which can be elementary school , high school , or college . Defaults to None , meaning that it will be randomly sampled. -
high_score: The high score of the query to be generated, which can be 4 , 4.5 , or 5 . Defaults to None , meaning that it will be randomly sampled. -
low_score: The low score of the query to be generated, which can be 2.5 , 3 , or 3.5 . Defaults to None , meaning that it will be randomly sampled. -
seed: The random seed to be set in case there's any sampling within the format_input method. "},{"location":"components-gallery/tasks/monolingualtripletgenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[S1]\n OCOL1[S2]\n OCOL2[S3]\n OCOL3[model_name]\n end\n end\n\n subgraph MonolingualTripletGenerator\n StepOutput[Output Columns: S1, S2, S3, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n "},{"location":"components-gallery/tasks/monolingualtripletgenerator/#outputs","title":"Outputs","text":" -
S1 (str ): the first sentence generated by the LLM . -
S2 (str ): the second sentence generated by the LLM . -
S3 (str ): the third sentence generated by the LLM . -
model_name (str ): the name of the model used to generate the monolingual triplets. "},{"location":"components-gallery/tasks/monolingualtripletgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/monolingualtripletgenerator/#generate-monolingual-triplets-for-training-embedding-models","title":"Generate monolingual triplets for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import MonolingualTripletGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = MonolingualTripletGenerator(\n language=\"English\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n "},{"location":"components-gallery/tasks/bitextretrievalgenerator/","title":"BitextRetrievalGenerator","text":"Generate bitext retrieval data with an LLM to later on train an embedding model. BitextRetrievalGenerator is a GeneratorTask that generates bitext retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/bitextretrievalgenerator/#attributes","title":"Attributes","text":" -
source_language: The source language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
target_language: The target language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
unit: The unit of the data to be generated, which can be sentence , phrase , or passage . Defaults to None , meaning that it will be randomly sampled. -
difficulty: The difficulty of the query to be generated, which can be elementary school , high school , or college . Defaults to None , meaning that it will be randomly sampled. -
high_score: The high score of the query to be generated, which can be 4 , 4.5 , or 5 . Defaults to None , meaning that it will be randomly sampled. -
low_score: The low score of the query to be generated, which can be 2.5 , 3 , or 3.5 . Defaults to None , meaning that it will be randomly sampled. -
seed: The random seed to be set in case there's any sampling within the format_input method. "},{"location":"components-gallery/tasks/bitextretrievalgenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[S1]\n OCOL1[S2]\n OCOL2[S3]\n OCOL3[model_name]\n end\n end\n\n subgraph BitextRetrievalGenerator\n StepOutput[Output Columns: S1, S2, S3, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n "},{"location":"components-gallery/tasks/bitextretrievalgenerator/#outputs","title":"Outputs","text":" -
S1 (str ): the first sentence generated by the LLM . -
S2 (str ): the second sentence generated by the LLM . -
S3 (str ): the third sentence generated by the LLM . -
model_name (str ): the name of the model used to generate the bitext retrieval data. "},{"location":"components-gallery/tasks/bitextretrievalgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/bitextretrievalgenerator/#generate-bitext-retrieval-data-for-training-embedding-models","title":"Generate bitext retrieval data for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import BitextRetrievalGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = BitextRetrievalGenerator(\n source_language=\"English\",\n target_language=\"Spanish\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n "},{"location":"components-gallery/tasks/embeddingtaskgenerator/","title":"EmbeddingTaskGenerator","text":"Generate task descriptions for embedding-related tasks using an LLM . EmbeddingTaskGenerator is a GeneratorTask that doesn't receive any input besides the provided attributes and generates task descriptions for embedding-related tasks using a pre-defined prompt based on the category attribute. The category attribute should be one of the following: - `text-retrieval`: Generate task descriptions for text retrieval tasks.\n- `text-matching-short`: Generate task descriptions for short text matching tasks.\n- `text-matching-long`: Generate task descriptions for long text matching tasks.\n- `text-classification`: Generate task descriptions for text classification tasks.\n "},{"location":"components-gallery/tasks/embeddingtaskgenerator/#attributes","title":"Attributes","text":" -
category: The category of the task to be generated, which can either be text-retrieval , text-matching-short , text-matching-long , or text-classification . -
flatten_tasks: Whether to flatten the tasks i.e. since a list of tasks is generated by the LLM , this attribute indicates whether to flatten the list or not. Defaults to False , meaning that running this task with num_generations=1 will return a distilabel.Distiset with one row only containing a list with around 20 tasks; otherwise, if set to True , it will return a distilabel.Distiset with around 20 rows, each containing one task. "},{"location":"components-gallery/tasks/embeddingtaskgenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[tasks]\n OCOL1[task]\n OCOL2[model_name]\n end\n end\n\n subgraph EmbeddingTaskGenerator\n StepOutput[Output Columns: tasks, task, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n "},{"location":"components-gallery/tasks/embeddingtaskgenerator/#outputs","title":"Outputs","text":" -
tasks (List[str] ): the list of tasks generated by the LLM . -
task (str ): the task generated by the LLM if flatten_tasks=True . -
model_name (str ): the name of the model used to generate the tasks. "},{"location":"components-gallery/tasks/embeddingtaskgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/embeddingtaskgenerator/#generate-embedding-tasks-for-text-retrieval","title":"Generate embedding tasks for text retrieval","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-retrieval\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n ...\n\n task >> ...\n "},{"location":"components-gallery/tasks/embeddingtaskgenerator/#references","title":"References","text":" - Improving Text Embeddings with Large Language Models
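Side note on the flatten_tasks attribute described above: the difference between the two settings is purely one of data shape. A minimal standalone sketch of what the flattening amounts to (plain Python, not the distilabel API; the task strings below are made up for illustration):

# Hypothetical output of a single generation: one row holding a list of task descriptions.
rows = [{"tasks": ["Retrieve FAQ answers", "Match paper abstracts", "Classify support tickets"]}]

# flatten_tasks=False keeps the list as-is: one row with a list column.
unflattened = rows

# flatten_tasks=True explodes the list: one row per task description.
flattened = [{"task": task} for row in rows for task in row["tasks"]]

print(len(unflattened), len(flattened))  # 1 3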
"},{"location":"components-gallery/llms/","title":"LLMs Gallery","text":" -
AnthropicLLM Anthropic LLM implementation running the Async API client. AnthropicLLM -
OpenAILLM OpenAI LLM implementation running the async API client. OpenAILLM -
AnyscaleLLM Anyscale LLM implementation running the async API client of OpenAI. AnyscaleLLM -
AzureOpenAILLM Azure OpenAI LLM implementation running the async API client. AzureOpenAILLM -
TogetherLLM TogetherLLM LLM implementation running the async API client of OpenAI. TogetherLLM -
ClientvLLM A client for the vLLM server implementing the OpenAI API specification. ClientvLLM -
CohereLLM Cohere API implementation using the async client for concurrent text generation. CohereLLM -
GroqLLM Groq API implementation using the async client for concurrent text generation. GroqLLM -
InferenceEndpointsLLM InferenceEndpoints LLM implementation running the async API client. InferenceEndpointsLLM -
LiteLLM LiteLLM implementation running the async API client. LiteLLM -
MistralLLM Mistral LLM implementation running the async API client. MistralLLM -
MixtureOfAgentsLLM Mixture-of-Agents implementation. MixtureOfAgentsLLM -
OllamaLLM Ollama LLM implementation running the Async API client. OllamaLLM -
VertexAILLM VertexAI LLM implementation running the async API clients for Gemini. VertexAILLM -
TransformersLLM Hugging Face transformers library LLM implementation using the text generation pipeline. TransformersLLM -
LlamaCppLLM llama.cpp LLM implementation running the Python bindings for the C++ code. LlamaCppLLM -
vLLM vLLM library LLM implementation. vLLM "},{"location":"components-gallery/llms/anthropicllm/","title":"AnthropicLLM","text":"Anthropic LLM implementation running the Async API client. "},{"location":"components-gallery/llms/anthropicllm/#attributes","title":"Attributes","text":" -
model: the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\", \"claude-3-sonnet-20240229\", etc. Available models can be checked here: Anthropic: Models overview. -
api_key: the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable. -
base_url: the base URL to use for the Anthropic API. Defaults to None which means that https://api.anthropic.com will be used internally. -
timeout: the maximum time in seconds to wait for a response. Defaults to 600.0 . -
max_retries: The maximum number of times to retry the request before failing. Defaults to 6 . -
http_client: if provided, an alternative HTTP client to use for calling Anthropic API. Defaults to None . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . -
_api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally. -
_aclient: the AsyncAnthropic client to use for the Anthropic API. It is meant to be used internally. Set in the load method. "},{"location":"components-gallery/llms/anthropicllm/#runtime-parameters","title":"Runtime Parameters","text":" -
api_key: the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable. -
base_url: the base URL to use for the Anthropic API. Defaults to \"https://api.anthropic.com\" . -
timeout: the maximum time in seconds to wait for a response. Defaults to 600.0 . -
max_retries: the maximum number of times to retry the request before failing. Defaults to 6 . "},{"location":"components-gallery/llms/anthropicllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/anthropicllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import AnthropicLLM\n\nllm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/anthropicllm/#generate-structured-data","title":"Generate structured data","text":"from pydantic import BaseModel\nfrom distilabel.models.llms import AnthropicLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = AnthropicLLM(\n model=\"claude-3-opus-20240229\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n "},{"location":"components-gallery/llms/openaillm/","title":"OpenAILLM","text":"OpenAI LLM implementation running the async API client. "},{"location":"components-gallery/llms/openaillm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc. Supported models can be found here. -
base_url: the base URL to use for the OpenAI API requests. Defaults to None , which means that the value set for the environment variable OPENAI_BASE_URL will be used, or \"https://api.openai.com/v1\" if not set. -
api_key: the API key to authenticate the requests to the OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_KEY will be used, or None if not set. -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . "},{"location":"components-gallery/llms/openaillm/#runtime-parameters","title":"Runtime Parameters","text":" -
base_url: the base URL to use for the OpenAI API requests. Defaults to None . -
api_key: the API key to authenticate the requests to the OpenAI API. Defaults to None . -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . "},{"location":"components-gallery/llms/openaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/openaillm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/openaillm/#generate-text-from-a-custom-endpoint-following-the-openai-api","title":"Generate text from a custom endpoint following the OpenAI API","text":"from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/openaillm/#generate-structured-data","title":"Generate structured data","text":"from pydantic import BaseModel\nfrom distilabel.models.llms import OpenAILLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = OpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n "},{"location":"components-gallery/llms/openaillm/#generate-with-batch-api-offline-batch-generation","title":"Generate with Batch API (offline batch generation)","text":"from distilabel.models.llms import OpenAILLM\n\nload = llm = OpenAILLM(\n model=\"gpt-3.5-turbo\",\n use_offline_batch_generation=True,\n offline_batch_generation_block_until_done=5, # poll for results every 5 seconds\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n# [['Hello! How can I assist you today?']]\n "},{"location":"components-gallery/llms/anyscalellm/","title":"AnyscaleLLM","text":"Anyscale LLM implementation running the async API client of OpenAI. "},{"location":"components-gallery/llms/anyscalellm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM, e.g., google/gemma-7b-it . See the supported models under the \"Text Generation -> Supported Models\" section here. -
base_url: the base URL to use for the Anyscale API requests. Defaults to None , which means that the value set for the environment variable ANYSCALE_BASE_URL will be used, or \"https://api.endpoints.anyscale.com/v1\" if not set. -
api_key: the API key to authenticate the requests to the Anyscale API. Defaults to None which means that the value set for the environment variable ANYSCALE_API_KEY will be used, or None if not set. -
_api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally. "},{"location":"components-gallery/llms/anyscalellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/anyscalellm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import AnyscaleLLM\n\nllm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/azureopenaillm/","title":"AzureOpenAILLM","text":"Azure OpenAI LLM implementation running the async API client. "},{"location":"components-gallery/llms/azureopenaillm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM i.e. the name of the Azure deployment. -
base_url: the base URL to use for the Azure OpenAI API can be set with AZURE_OPENAI_ENDPOINT . Defaults to None which means that the value set for the environment variable AZURE_OPENAI_ENDPOINT will be used, or None if not set. -
api_key: the API key to authenticate the requests to the Azure OpenAI API. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_API_KEY will be used, or None if not set. -
api_version: the API version to use for the Azure OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_VERSION will be used, or None if not set. "},{"location":"components-gallery/llms/azureopenaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/azureopenaillm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/azureopenaillm/#generate-text-from-a-custom-endpoint-following-the-openai-api","title":"Generate text from a custom endpoint following the OpenAI API","text":"from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/azureopenaillm/#generate-structured-data","title":"Generate structured data","text":"from pydantic import BaseModel\nfrom distilabel.models.llms import AzureOpenAILLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = AzureOpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n "},{"location":"components-gallery/llms/togetherllm/","title":"TogetherLLM","text":"TogetherLLM LLM implementation running the async API client of OpenAI. "},{"location":"components-gallery/llms/togetherllm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\". Supported models can be found here. -
base_url: the base URL to use for the Together API can be set with TOGETHER_BASE_URL . Defaults to None which means that the value set for the environment variable TOGETHER_BASE_URL will be used, or \"https://api.together.xyz/v1\" if not set. -
api_key: the API key to authenticate the requests to the Together API. Defaults to None which means that the value set for the environment variable TOGETHER_API_KEY will be used, or None if not set. -
_api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally. "},{"location":"components-gallery/llms/togetherllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/togetherllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import TogetherLLM\n\nllm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/clientvllm/","title":"ClientvLLM","text":"A client for the vLLM server implementing the OpenAI API specification. "},{"location":"components-gallery/llms/clientvllm/#attributes","title":"Attributes","text":" -
base_url: the base URL of the vLLM server. Defaults to \"http://localhost:8000\" . -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
httpx_client_kwargs: extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None . -
tokenizer: the Hugging Face Hub repo id or path of the tokenizer that will be used to apply the chat template and tokenize the inputs before sending it to the server. Defaults to None . -
tokenizer_revision: the revision of the tokenizer to load. Defaults to None . -
_aclient: the httpx.AsyncClient used to communicate with the vLLM server. Defaults to None . "},{"location":"components-gallery/llms/clientvllm/#runtime-parameters","title":"Runtime Parameters","text":" -
base_url: the base url of the vLLM server. Defaults to \"http://localhost:8000\" . -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
httpx_client_kwargs: extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None . "},{"location":"components-gallery/llms/clientvllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/clientvllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import ClientvLLM\n\nllm = ClientvLLM(\n base_url=\"http://localhost:8000/v1\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n)\n\nllm.load()\n\nresults = llm.generate_outputs(\n inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n temperature=0.7,\n top_p=1.0,\n max_new_tokens=256,\n)\n# [\n# [\n# \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n# \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n# \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. What's on your mind?\"\n# ]\n# ]\n "},{"location":"components-gallery/llms/coherellm/","title":"CohereLLM","text":"Cohere API implementation using the async client for concurrent text generation. "},{"location":"components-gallery/llms/coherellm/#attributes","title":"Attributes","text":" -
model: the name of the model from the Cohere API to use for the generation. -
base_url: the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\" . -
api_key: the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable. -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
client_name: the name of the client to use for the API requests. Defaults to \"distilabel\" . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . -
_ChatMessage: the ChatMessage class from the cohere package. -
_aclient: the AsyncClient client from the cohere package. "},{"location":"components-gallery/llms/coherellm/#runtime-parameters","title":"Runtime Parameters","text":" -
base_url: the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\" . -
api_key: the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable. -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
client_name: the name of the client to use for the API requests. Defaults to \"distilabel\" . "},{"location":"components-gallery/llms/coherellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/coherellm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import CohereLLM\n\nllm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n "},{"location":"components-gallery/llms/groqllm/","title":"GroqLLM","text":"Groq API implementation using the async client for concurrent text generation. "},{"location":"components-gallery/llms/groqllm/#attributes","title":"Attributes","text":" -
model: the name of the model from the Groq API to use for the generation. -
base_url: the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\" . -
api_key: the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable. -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 2 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . -
_api_key_env_var: the name of the environment variable to use for the API key. -
_aclient: the AsyncGroq client from the groq package. "},{"location":"components-gallery/llms/groqllm/#runtime-parameters","title":"Runtime Parameters","text":" -
base_url: the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\" . -
api_key: the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable. -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 2 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . "},{"location":"components-gallery/llms/groqllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/groqllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import GroqLLM\n\nllm = GroqLLM(model=\"llama3-70b-8192\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n "},{"location":"components-gallery/llms/inferenceendpointsllm/","title":"InferenceEndpointsLLM","text":"InferenceEndpoints LLM implementation running the async API client. This LLM will internally use huggingface_hub.AsyncInferenceClient . "},{"location":"components-gallery/llms/inferenceendpointsllm/#attributes","title":"Attributes","text":" -
model_id: the model ID to use for the LLM as available in the Hugging Face Hub, which will be used to resolve the base URL for the serverless Inference Endpoints API requests. Defaults to None . -
endpoint_name: the name of the Inference Endpoint to use for the LLM. Defaults to None . -
endpoint_namespace: the namespace of the Inference Endpoint to use for the LLM. Defaults to None . -
base_url: the base URL to use for the Inference Endpoints API requests. -
api_key: the API key to authenticate the requests to the Inference Endpoints API. -
tokenizer_id: the tokenizer ID to use for the LLM as available in the Hugging Face Hub. Defaults to None , but defining one is recommended to properly format the prompt. -
model_display_name: the model display name to use for the LLM. Defaults to None . -
use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False . -
magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . -
structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. "},{"location":"components-gallery/llms/inferenceendpointsllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/inferenceendpointsllm/#free-serverless-inference-api-set-the-input_batch_size-of-the-task-that-uses-this-to-avoid-model-is-overloaded","title":"Free serverless Inference API, set the input_batch_size of the Task that uses this to avoid Model is overloaded","text":"from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/inferenceendpointsllm/#dedicated-inference-endpoints","title":"Dedicated Inference Endpoints","text":"from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n endpoint_name=\"<ENDPOINT_NAME>\",\n api_key=\"<HF_API_KEY>\",\n endpoint_namespace=\"<USER|ORG>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/inferenceendpointsllm/#dedicated-inference-endpoints-or-tgi","title":"Dedicated Inference Endpoints or TGI","text":"from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n api_key=\"<HF_API_KEY>\",\n base_url=\"<BASE_URL>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/inferenceendpointsllm/#generate-structured-data","title":"Generate structured data","text":"from pydantic import BaseModel\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n api_key=\"api.key\",\n structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n "},{"location":"components-gallery/llms/litellm/","title":"LiteLLM","text":"LiteLLM implementation running the async API client. "},{"location":"components-gallery/llms/litellm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\", etc. -
verbose: whether to log the LiteLLM client's logs. Defaults to False . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . "},{"location":"components-gallery/llms/litellm/#runtime-parameters","title":"Runtime Parameters","text":" - verbose: whether to log the LiteLLM client's logs. Defaults to
False . "},{"location":"components-gallery/llms/litellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/litellm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import LiteLLM\n\nllm = LiteLLM(model=\"gpt-3.5-turbo\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n "},{"location":"components-gallery/llms/mistralllm/","title":"MistralLLM","text":"Mistral LLM implementation running the async API client. "},{"location":"components-gallery/llms/mistralllm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc. -
endpoint: the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\". -
api_key: the API key to authenticate the requests to the Mistral API. Defaults to None which means that the value set for the environment variable MISTRAL_API_KEY will be used, or None if not set. -
max_retries: the maximum number of retries to attempt when a request fails. Defaults to 5 . -
timeout: the maximum time in seconds to wait for a response. Defaults to 120 . -
max_concurrent_requests: the maximum number of concurrent requests to send. Defaults to 64 . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . -
_api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally. -
_aclient: the Mistral client to use for the Mistral API. It is meant to be used internally. Set in the load method. "},{"location":"components-gallery/llms/mistralllm/#runtime-parameters","title":"Runtime Parameters","text":" -
api_key: the API key to authenticate the requests to the Mistral API. -
max_retries: the maximum number of retries to attempt when a request fails. Defaults to 5 . -
timeout: the maximum time in seconds to wait for a response. Defaults to 120 . -
max_concurrent_requests: the maximum number of concurrent requests to send. Defaults to 64 . "},{"location":"components-gallery/llms/mistralllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/mistralllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import MistralLLM\n\nllm = MistralLLM(model=\"open-mixtral-8x22b\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n "},{"location":"components-gallery/llms/mixtureofagentsllm/","title":"MixtureOfAgentsLLM","text":"Mixture-of-Agents implementation. An LLM class that leverages the collective strengths of several LLM s to generate a response, as described in the \"Mixture-of-Agents Enhances Large Language Model Capabilities\" paper. There is a list of LLM s proposing/generating outputs that LLM s from the next round/layer can use as auxiliary information. Finally, there is an LLM that aggregates the outputs to generate the final response; a small illustrative sketch of this flow is included after the references below. "},{"location":"components-gallery/llms/mixtureofagentsllm/#attributes","title":"Attributes","text":" -
aggregator_llm: The LLM that aggregates the outputs of the proposer LLM s. -
proposers_llms: The list of LLM s that propose outputs to be aggregated. -
rounds: The number of layers or rounds that the proposers_llms will generate outputs. Defaults to 1 . "},{"location":"components-gallery/llms/mixtureofagentsllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/mixtureofagentsllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\nllm = MixtureOfAgentsLLM(\n aggregator_llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n proposers_llms=[\n InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n InferenceEndpointsLLM(\n model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n ),\n InferenceEndpointsLLM(\n model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n ),\n ],\n rounds=2,\n)\n\nllm.load()\n\noutput = llm.generate_outputs(\n inputs=[\n [\n {\n \"role\": \"user\",\n \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n }\n ]\n ]\n)\n "},{"location":"components-gallery/llms/mixtureofagentsllm/#references","title":"References","text":" - Mixture-of-Agents Enhances Large Language Model Capabilities
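The proposer/aggregator flow described for MixtureOfAgentsLLM above can be made concrete with a small standalone sketch (illustrative only, not distilabel's implementation; the plain callables standing in for LLM s and the way auxiliary responses are appended to the prompt are assumptions):

from typing import Callable, List

def mixture_of_agents(
    prompt: str,
    proposers: List[Callable[[str], str]],
    aggregator: Callable[[str], str],
    rounds: int = 1,
) -> str:
    # Each round, every proposer answers; their outputs are fed back as auxiliary context.
    context = prompt
    for _ in range(rounds):
        proposals = [propose(context) for propose in proposers]
        context = prompt + "\n\nAuxiliary responses:\n" + "\n".join(f"- {p}" for p in proposals)
    # The aggregator sees the original prompt plus the last round of proposals.
    return aggregator(context)

# Toy usage with plain functions standing in for LLM s:
shout = lambda p: p.splitlines()[0].upper()
echo = lambda p: p.splitlines()[-1]
print(mixture_of_agents("Summarise the review.", [shout, echo], aggregator=echo, rounds=2))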
"},{"location":"components-gallery/llms/ollamallm/","title":"OllamaLLM","text":"Ollama LLM implementation running the Async API client. "},{"location":"components-gallery/llms/ollamallm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM e.g. \"notus\". -
host: the Ollama server host. -
timeout: the timeout for the LLM. Defaults to 120 . -
_aclient: the AsyncClient to use for the Ollama API. It is meant to be used internally. Set in the load method. "},{"location":"components-gallery/llms/ollamallm/#runtime-parameters","title":"Runtime Parameters","text":""},{"location":"components-gallery/llms/ollamallm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/ollamallm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import OllamaLLM\n\nllm = OllamaLLM(model=\"llama3\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/vertexaillm/","title":"VertexAILLM","text":"VertexAI LLM implementation running the async API clients for Gemini. -
Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini To use the VertexAILLM it is necessary to have configured the Google Cloud authentication using one of these methods: - Setting
GOOGLE_CLOUD_CREDENTIALS environment variable - Using
gcloud auth application-default login command - Using
vertexai.init function from the google-cloud-aiplatform library "},{"location":"components-gallery/llms/vertexaillm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM e.g. \"gemini-1.0-pro\". Supported models. -
_aclient: the GenerativeModel to use for the Vertex AI Gemini API. It is meant to be used internally. Set in the load method. "},{"location":"components-gallery/llms/vertexaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/vertexaillm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import VertexAILLM\n\nllm = VertexAILLM(model=\"gemini-1.5-pro\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/transformersllm/","title":"TransformersLLM","text":"Hugging Face transformers library LLM implementation using the text generation pipeline. "},{"location":"components-gallery/llms/transformersllm/#attributes","title":"Attributes","text":" -
model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. -
revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . -
torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\" . -
trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . -
model_kwargs: additional dictionary of keyword arguments that will be passed to the from_pretrained method of the model. -
tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer config files. If not provided, the one associated to the model will be used. Defaults to None . -
use_fast: whether to use a fast tokenizer or not. Defaults to True . -
chat_template: a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None . -
device: the name or index of the device where the model will be loaded. Defaults to None . -
device_map: a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\" . Defaults to None . -
token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None . -
structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. -
use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False . -
magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . "},{"location":"components-gallery/llms/transformersllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/transformersllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import TransformersLLM\n\nllm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/llamacppllm/","title":"LlamaCppLLM","text":"llama.cpp LLM implementation running the Python bindings for the C++ code. "},{"location":"components-gallery/llms/llamacppllm/#attributes","title":"Attributes","text":" -
model_path: contains the path to the GGUF quantized model, compatible with the installed version of the llama.cpp Python bindings. -
n_gpu_layers: the number of layers to use for the GPU. Defaults to -1 , meaning that the available GPU device will be used. -
chat_format: the chat format to use for the model. Defaults to None , which means the Llama format will be used. -
n_ctx: the context size to use for the model. Defaults to 512 . -
n_batch: the prompt processing maximum batch size to use for the model. Defaults to 512 . -
seed: random seed to use for the generation. Defaults to 4294967295 . -
verbose: whether to print verbose output. Defaults to False . -
structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. -
extra_kwargs: additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {} . -
_model: the Llama model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. "},{"location":"components-gallery/llms/llamacppllm/#runtime-parameters","title":"Runtime Parameters","text":" -
model_path: the path to the GGUF quantized model. -
n_gpu_layers: the number of layers to use for the GPU. Defaults to -1 . -
chat_format: the chat format to use for the model. Defaults to None . -
verbose: whether to print verbose output. Defaults to False . -
extra_kwargs: additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {} . "},{"location":"components-gallery/llms/llamacppllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/llamacppllm/#generate-text","title":"Generate text","text":"from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\n\n# You can follow along with this example by downloading the model with the following\n# command in the terminal, which will download it to the `Downloads` folder:\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nllm = LlamaCppLLM(\n model_path=str(Path.home() / model_path),\n n_gpu_layers=-1, # To use the GPU if available\n n_ctx=1024, # Set the context size\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/llamacppllm/#generate-structured-data","title":"Generate structured data","text":"from pathlib import Path\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import LlamaCppLLM\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = LlamaCppLLM(\n model_path=str(Path.home() / model_path), # type: ignore\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n "},{"location":"components-gallery/llms/llamacppllm/#references","title":"References","text":" -
llama.cpp -
llama-cpp-python "},{"location":"components-gallery/llms/vllm/","title":"vLLM","text":"vLLM library LLM implementation. "},{"location":"components-gallery/llms/vllm/#attributes","title":"Attributes","text":" -
model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. -
dtype: the data type to use for the model. Defaults to auto . -
trust_remote_code: whether to trust the remote code when loading the model. Defaults to False . -
quantization: the quantization mode to use for the model. Defaults to None . -
revision: the revision of the model to load. Defaults to None . -
tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer files. If not provided, the tokenizer will be loaded from the model directory. Defaults to None . -
tokenizer_mode: the mode to use for the tokenizer. Defaults to auto . -
tokenizer_revision: the revision of the tokenizer to load. Defaults to None . -
skip_tokenizer_init: whether to skip the initialization of the tokenizer. Defaults to False . -
chat_template: a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None . -
structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. -
seed: the seed to use for the random number generator. Defaults to 0 . -
extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {} . -
_model: the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. -
_tokenizer: the tokenizer instance used to format the prompt before passing it to the LLM . This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. -
use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False . -
magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . "},{"location":"components-gallery/llms/vllm/#runtime-parameters","title":"Runtime Parameters","text":" - extra_kwargs: additional dictionary of keyword arguments that will be passed to the
LLM class of vllm library. "},{"location":"components-gallery/llms/vllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/vllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import vLLM\n\n# You can pass a custom chat_template to the model\nllm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/vllm/#generate-structured-data","title":"Generate structured data","text":"from pydantic import BaseModel\nfrom distilabel.models.llms import vLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n "},{"location":"components-gallery/embeddings/","title":"Embeddings Gallery","text":" -
SentenceTransformerEmbeddings sentence-transformers library implementation for embedding generation. SentenceTransformerEmbeddings -
vLLMEmbeddings vllm library implementation for embedding generation. vLLMEmbeddings "},{"location":"components-gallery/embeddings/sentencetransformerembeddings/","title":"SentenceTransformerEmbeddings","text":"sentence-transformers library implementation for embedding generation. "},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#attributes","title":"Attributes","text":" -
model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. -
device: the name of the device used to load the model e.g. \"cuda\", \"mps\", etc. Defaults to None . -
prompts: a dictionary containing prompts to be used with the model. Defaults to None . -
default_prompt_name: the default prompt (in prompts ) that will be applied to the inputs. If not provided, then no prompt will be used. Defaults to None . -
trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . -
revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . -
token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None . -
truncate_dim: the dimension to truncate the sentence embeddings. Defaults to None . -
model_kwargs: extra kwargs that will be passed to the Hugging Face transformers model class. Defaults to None . -
tokenizer_kwargs: extra kwargs that will be passed to the Hugging Face transformers tokenizer class. Defaults to None . -
config_kwargs: extra kwargs that will be passed to the Hugging Face transformers configuration class. Defaults to None . -
precision: the dtype that the resulting embeddings will have. Defaults to \"float32\" . -
normalize_embeddings: whether to normalize the embeddings so they have a length of 1. Defaults to None . "},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#generating-sentence-embeddings","title":"Generating sentence embeddings","text":"from distilabel.models import SentenceTransformerEmbeddings\n\nembeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n# [-0.05447685346007347, -0.01623094454407692, ...],\n# [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n "},{"location":"components-gallery/embeddings/vllmembeddings/","title":"vLLMEmbeddings","text":"vllm library implementation for embedding generation. "},{"location":"components-gallery/embeddings/vllmembeddings/#attributes","title":"Attributes","text":" -
model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. -
dtype: the data type to use for the model. Defaults to auto . -
trust_remote_code: whether to trust the remote code when loading the model. Defaults to False . -
quantization: the quantization mode to use for the model. Defaults to None . -
revision: the revision of the model to load. Defaults to None . -
enforce_eager: whether to enforce eager execution. Defaults to True . -
seed: the seed to use for the random number generator. Defaults to 0 . -
extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {} . -
_model: the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. "},{"location":"components-gallery/embeddings/vllmembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/embeddings/vllmembeddings/#generating-sentence-embeddings","title":"Generating sentence embeddings","text":"from distilabel.models import vLLMEmbeddings\n\nembeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n# [-0.05447685346007347, -0.01623094454407692, ...],\n# [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n "},{"location":"components-gallery/embeddings/vllmembeddings/#references","title":"References","text":" - Offline inference embeddings
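Once either embeddings class has produced its vectors, downstream use is plain vector arithmetic. A short sketch of scoring similarity between two encode() outputs (the truncated vectors below are made-up placeholders, not real model output):

import math

def cosine_similarity(a, b):
    # Standard cosine similarity between two equal-length vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)

# Placeholders standing in for `embeddings.encode(inputs=[...])` results.
emb_distilabel = [-0.0545, -0.0162, 0.0311]
emb_argilla = [0.00004, 0.0440, 0.0127]
print(round(cosine_similarity(emb_distilabel, emb_argilla), 4))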
"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Distilabel","text":"Synthesize data for AI and add feedback on the fly! Distilabel is the framework for synthetic data and AI feedback for engineers who need fast, reliable and scalable pipelines based on verified research papers. -
Get started in 5 minutes! Install distilabel with pip and run your first Pipeline to generate and evaluate synthetic data. Quickstart -
How-to guides Get familiar with the basics of distilabel. Learn how to define steps , tasks and llms and run your Pipeline . Learn more "},{"location":"#why-use-distilabel","title":"Why use distilabel?","text":"Distilabel can be used for generating synthetic data and AI feedback for a wide variety of projects including traditional predictive NLP (classification, extraction, etc.), or generative and large language model scenarios (instruction following, dialogue generation, judging etc.). Distilabel's programmatic approach allows you to build scalable pipelines for data generation and AI feedback. The goal of distilabel is to accelerate your AI development by quickly generating high-quality, diverse datasets based on verified research methodologies for generating and judging with AI feedback. Improve your AI output quality through data quality Compute is expensive and output quality is important. We help you focus on data quality, which tackles the root cause of both of these problems at once. Distilabel helps you to synthesize and judge data to let you spend your valuable time achieving and keeping high-quality standards for your synthetic data. Take control of your data and models Ownership of data for fine-tuning your own LLMs is not easy but distilabel can help you to get started. We integrate AI feedback from any LLM provider out there using one unified API. Improve efficiency by quickly iterating on the right data and models Synthesize and judge data with latest research papers while ensuring flexibility, scalability and fault tolerance. So you can focus on improving your data and training your models. "},{"location":"#what-do-people-build-with-distilabel","title":"What do people build with distilabel?","text":"The Argilla community uses distilabel to create amazing datasets and models. - The 1M OpenHermesPreference is a dataset of ~1 million AI preferences derived from teknium/OpenHermes-2.5. It shows how we can use Distilabel to synthesize data on an immense scale.
- Our distilabeled Intel Orca DPO dataset and the improved OpenHermes model, show how we improve model performance by filtering out 50% of the original dataset through AI feedback.
- The haiku DPO data outlines how anyone can create a dataset for a specific task and the latest research papers to improve the quality of the dataset.
"},{"location":"api/cli/","title":"Command Line Interface (CLI)","text":"This section contains the API reference for the CLI. For more information on how to use the CLI, see Tutorial - CLI. "},{"location":"api/cli/#utility-functions-for-the-distilabel-pipeline-sub-commands","title":"Utility functions for the distilabel pipeline sub-commands","text":"Here are some utility functions to help working with the pipelines in the console. "},{"location":"api/cli/#distilabel.cli.pipeline.utils","title":"utils ","text":""},{"location":"api/cli/#distilabel.cli.pipeline.utils.parse_runtime_parameters","title":"parse_runtime_parameters(params) ","text":"Parses the runtime parameters from the CLI format to the format expected by the Pipeline.run method. The CLI format is a list of tuples, where the first element is a list of keys and the second element is the value. Parameters: Name Type Description Default params List[Tuple[List[str], str]] A list of tuples, where the first element is a list of keys and the second element is the value. required Returns: Type Description Dict[str, Dict[str, Any]] A dictionary with the runtime parameters in the format expected by the Dict[str, Dict[str, Any]] Pipeline.run method. Source code in src/distilabel/cli/pipeline/utils.py def parse_runtime_parameters(\n params: List[Tuple[List[str], str]],\n) -> Dict[str, Dict[str, Any]]:\n \"\"\"Parses the runtime parameters from the CLI format to the format expected by the\n `Pipeline.run` method. The CLI format is a list of tuples, where the first element is\n a list of keys and the second element is the value.\n\n Args:\n params: A list of tuples, where the first element is a list of keys and the\n second element is the value.\n\n Returns:\n A dictionary with the runtime parameters in the format expected by the\n `Pipeline.run` method.\n \"\"\"\n runtime_params = {}\n for keys, value in params:\n current = runtime_params\n for i, key in enumerate(keys):\n if i == len(keys) - 1:\n current[key] = value\n else:\n current = current.setdefault(key, {})\n return runtime_params\n "},{"location":"api/cli/#distilabel.cli.pipeline.utils.valid_http_url","title":"valid_http_url(url) ","text":"Check if the URL is a valid HTTP URL. Parameters: Name Type Description Default url str The URL to check. required Returns: Type Description bool True , if the URL is a valid HTTP URL. False , otherwise. Source code in src/distilabel/cli/pipeline/utils.py def valid_http_url(url: str) -> bool:\n \"\"\"Check if the URL is a valid HTTP URL.\n\n Args:\n url: The URL to check.\n\n Returns:\n `True`, if the URL is a valid HTTP URL. `False`, otherwise.\n \"\"\"\n try:\n TypeAdapter(HttpUrl).validate_python(url) # type: ignore\n except ValidationError:\n return False\n\n return True\n "},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_config_from_url","title":"get_config_from_url(url) ","text":"Loads the pipeline configuration from a URL pointing to a JSON or YAML file. Parameters: Name Type Description Default url str The URL pointing to the pipeline configuration file. required Returns: Type Description Dict[str, Any] The pipeline configuration as a dictionary. Raises: Type Description ValueError If the file format is not supported. 
Source code in src/distilabel/cli/pipeline/utils.py def get_config_from_url(url: str) -> Dict[str, Any]:\n \"\"\"Loads the pipeline configuration from a URL pointing to a JSON or YAML file.\n\n Args:\n url: The URL pointing to the pipeline configuration file.\n\n Returns:\n The pipeline configuration as a dictionary.\n\n Raises:\n ValueError: If the file format is not supported.\n \"\"\"\n if not url.endswith((\".json\", \".yaml\", \".yml\")):\n raise DistilabelUserError(\n f\"Unsupported file format for '{url}'. Only JSON and YAML are supported\",\n page=\"sections/how_to_guides/basic/pipeline/?h=seriali#serializing-the-pipeline\",\n )\n response = _download_remote_file(url)\n\n if url.endswith((\".yaml\", \".yml\")):\n content = response.content.decode(\"utf-8\")\n return yaml.safe_load(content)\n\n return response.json()\n "},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_pipeline_from_url","title":"get_pipeline_from_url(url, pipeline_name='pipeline') ","text":"Downloads the file to the current working directory and loads the pipeline object from a python script. Parameters: Name Type Description Default url str The URL pointing to the python script with the pipeline definition. required pipeline_name str The name of the pipeline in the script. I.e: with Pipeline(...) as pipeline:... . 'pipeline' Returns: Type Description BasePipeline The pipeline instantiated. Raises: Type Description ValueError If the file format is not supported. Source code in src/distilabel/cli/pipeline/utils.py def get_pipeline_from_url(url: str, pipeline_name: str = \"pipeline\") -> \"BasePipeline\":\n \"\"\"Downloads the file to the current working directory and loads the pipeline object\n from a python script.\n\n Args:\n url: The URL pointing to the python script with the pipeline definition.\n pipeline_name: The name of the pipeline in the script.\n I.e: `with Pipeline(...) as pipeline:...`.\n\n Returns:\n The pipeline instantiated.\n\n Raises:\n ValueError: If the file format is not supported.\n \"\"\"\n if not url.endswith(\".py\"):\n raise DistilabelUserError(\n f\"Unsupported file format for '{url}'. It must be a python file.\",\n page=\"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run\",\n )\n response = _download_remote_file(url)\n\n content = response.content.decode(\"utf-8\")\n script_local = Path.cwd() / Path(url).name\n script_local.write_text(content)\n\n # Add the current working directory to sys.path\n sys.path.insert(0, os.getcwd())\n module = importlib.import_module(str(Path(url).stem))\n pipeline = getattr(module, pipeline_name, None)\n if not pipeline:\n raise ImportError(\n f\"The script must contain an object with the pipeline named: '{pipeline_name}' that can be imported\"\n )\n\n return pipeline\n "},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_pipeline","title":"get_pipeline(config_or_script, pipeline_name='pipeline') ","text":"Get a pipeline from a configuration file or a remote python script. Parameters: Name Type Description Default config_or_script str The path or URL to the pipeline configuration file or URL to a python script. required pipeline_name str The name of the pipeline in the script. I.e: with Pipeline(...) as pipeline:... . 'pipeline' Returns: Type Description BasePipeline The pipeline. Raises: Type Description ValueError If the file format is not supported. FileNotFoundError If the configuration file does not exist. 
Source code in src/distilabel/cli/pipeline/utils.py def get_pipeline(\n config_or_script: str, pipeline_name: str = \"pipeline\"\n) -> \"BasePipeline\":\n \"\"\"Get a pipeline from a configuration file or a remote python script.\n\n Args:\n config_or_script: The path or URL to the pipeline configuration file\n or URL to a python script.\n pipeline_name: The name of the pipeline in the script.\n I.e: `with Pipeline(...) as pipeline:...`.\n\n Returns:\n The pipeline.\n\n Raises:\n ValueError: If the file format is not supported.\n FileNotFoundError: If the configuration file does not exist.\n \"\"\"\n config = script = None\n if config_or_script.endswith((\".json\", \".yaml\", \".yml\")):\n config = config_or_script\n elif config_or_script.endswith(\".py\"):\n script = config_or_script\n else:\n raise DistilabelUserError(\n \"The file must be a valid config file or python script with a pipeline.\",\n page=\"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run\",\n )\n\n if valid_http_url(config_or_script):\n if config:\n data = get_config_from_url(config)\n return Pipeline.from_dict(data)\n return get_pipeline_from_url(script, pipeline_name=pipeline_name)\n\n if not config:\n raise ValueError(\n f\"To run a pipeline from a python script, run it as `python {script}`\"\n )\n\n if Path(config).is_file():\n return Pipeline.from_file(config)\n\n raise FileNotFoundError(f\"File '{config_or_script}' does not exist.\")\n "},{"location":"api/cli/#distilabel.cli.pipeline.utils.display_pipeline_information","title":"display_pipeline_information(pipeline) ","text":"Displays the pipeline information to the console. Parameters: Name Type Description Default pipeline BasePipeline The pipeline. required Source code in src/distilabel/cli/pipeline/utils.py def display_pipeline_information(pipeline: \"BasePipeline\") -> None:\n \"\"\"Displays the pipeline information to the console.\n\n Args:\n pipeline: The pipeline.\n \"\"\"\n from rich.console import Console\n\n Console().print(_build_pipeline_panel(pipeline))\n "},{"location":"api/distiset/","title":"Distiset","text":"This section contains the API reference for the Distiset. For more information on how to use the CLI, see Tutorial - CLI. "},{"location":"api/distiset/#distilabel.distiset.Distiset","title":"Distiset ","text":" Bases: dict Convenient wrapper around datasets.Dataset to push to the Hugging Face Hub. It's a dictionary where the keys correspond to the different leaf_steps from the internal DAG and the values are datasets.Dataset . Attributes: Name Type Description _pipeline_path Optional[Path] Optional path to the pipeline.yaml file that generated the dataset. Defaults to None . _artifacts_path Optional[Path] Optional path to the directory containing the generated artifacts by the pipeline steps. Defaults to None . _log_filename_path Optional[Path] Optional path to the pipeline.log file that generated was written by the pipeline. Defaults to None . _citations Optional[List[str]] Optional list containing citations that will be included in the dataset card. Defaults to None . 
Source code in src/distilabel/distiset.py class Distiset(dict):\n \"\"\"Convenient wrapper around `datasets.Dataset` to push to the Hugging Face Hub.\n\n It's a dictionary where the keys correspond to the different leaf_steps from the internal\n `DAG` and the values are `datasets.Dataset`.\n\n Attributes:\n _pipeline_path: Optional path to the `pipeline.yaml` file that generated the dataset.\n Defaults to `None`.\n _artifacts_path: Optional path to the directory containing the generated artifacts\n by the pipeline steps. Defaults to `None`.\n _log_filename_path: Optional path to the `pipeline.log` file that generated was written\n by the pipeline. Defaults to `None`.\n _citations: Optional list containing citations that will be included in the dataset\n card. Defaults to `None`.\n \"\"\"\n\n _pipeline_path: Optional[Path] = None\n _artifacts_path: Optional[Path] = None\n _log_filename_path: Optional[Path] = None\n _citations: Optional[List[str]] = None\n\n def push_to_hub(\n self,\n repo_id: str,\n private: bool = False,\n token: Optional[str] = None,\n generate_card: bool = True,\n include_script: bool = False,\n **kwargs: Any,\n ) -> None:\n \"\"\"Pushes the `Distiset` to the Hugging Face Hub, each dataset will be pushed as a different configuration\n corresponding to the leaf step that generated it.\n\n Args:\n repo_id:\n The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace\n of the logged-in user.\n private:\n Whether the dataset repository should be set to private or not. Only affects repository creation:\n a repository that already exists will not be affected by that parameter.\n token:\n An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n if no token is passed and the user is not logged-in.\n generate_card:\n Whether to generate a dataset card or not. Defaults to True.\n include_script:\n Whether you want to push the pipeline script to the hugging face hub to share it.\n If set to True, the name of the script that was run to create the distiset will be\n automatically determined, and that will be the name of the file uploaded to your\n repository. Take into account, this operation only makes sense for a distiset obtained\n from calling `Pipeline.run()` method. 
Defaults to False.\n **kwargs:\n Additional keyword arguments to pass to the `push_to_hub` method of the `datasets.Dataset` object.\n\n Raises:\n ValueError: If no token is provided and couldn't be retrieved automatically.\n \"\"\"\n script_filename = sys.argv[0]\n filename_py = (\n script_filename.split(\"/\")[-1]\n if \"/\" in script_filename\n else script_filename\n )\n script_path = Path.cwd() / script_filename\n\n if token is None:\n token = get_hf_token(self.__class__.__name__, \"token\")\n\n for name, dataset in self.items():\n dataset.push_to_hub(\n repo_id=repo_id,\n config_name=name,\n private=private,\n token=token,\n **kwargs,\n )\n\n if self.artifacts_path:\n upload_folder(\n repo_id=repo_id,\n folder_path=self.artifacts_path,\n path_in_repo=\"artifacts\",\n token=token,\n repo_type=\"dataset\",\n commit_message=\"Include pipeline artifacts\",\n )\n\n if include_script and script_path.exists():\n upload_file(\n path_or_fileobj=script_path,\n path_in_repo=filename_py,\n repo_id=repo_id,\n repo_type=\"dataset\",\n token=token,\n commit_message=\"Include pipeline script\",\n )\n\n if generate_card:\n self._generate_card(\n repo_id, token, include_script=include_script, filename_py=filename_py\n )\n\n def _get_card(\n self,\n repo_id: str,\n token: Optional[str] = None,\n include_script: bool = False,\n filename_py: Optional[str] = None,\n ) -> DistilabelDatasetCard:\n \"\"\"Generates the dataset card for the `Distiset`.\n\n Note:\n If `repo_id` and `token` are provided, it will extract the metadata from the README.md file\n on the hub.\n\n Args:\n repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.\n token: The token to authenticate with the Hugging Face Hub.\n We assume that if it's provided, the dataset will be in the Hugging Face Hub,\n so the README metadata will be extracted from there.\n include_script: Whether to upload the script to the hugging face repository.\n filename_py: The name of the script. If `include_script` is True, the script will\n be uploaded to the repository using this name, otherwise it won't be used.\n\n Returns:\n The dataset card for the `Distiset`.\n \"\"\"\n sample_records = {}\n for name, dataset in self.items():\n record = (\n dataset[0] if not isinstance(dataset, dict) else dataset[\"train\"][0]\n )\n for key, value in record.items():\n # If list is too big, the `README.md` generated will be huge so we truncate it\n if isinstance(value, list):\n length = len(value)\n if length < 10:\n continue\n record[key] = value[:10]\n record[key].append(\n f\"... 
(truncated - showing 10 of {length} elements)\"\n )\n sample_records[name] = record\n\n readme_metadata = {}\n if repo_id and token:\n readme_metadata = self._extract_readme_metadata(repo_id, token)\n\n metadata = {\n **readme_metadata,\n \"size_categories\": size_categories_parser(\n max(len(dataset) for dataset in self.values())\n ),\n \"tags\": [\"synthetic\", \"distilabel\", \"rlaif\"],\n }\n\n card = DistilabelDatasetCard.from_template(\n card_data=DatasetCardData(**metadata),\n repo_id=repo_id,\n sample_records=sample_records,\n include_script=include_script,\n filename_py=filename_py,\n artifacts=self._get_artifacts_metadata(),\n references=self.citations,\n )\n\n return card\n\n def _get_artifacts_metadata(self) -> Dict[str, List[Dict[str, Any]]]:\n \"\"\"Gets a dictionary with the metadata of the artifacts generated by the pipeline steps.\n\n Returns:\n A dictionary in which the key is the name of the step and the value is a list\n of dictionaries, each of them containing the name and metadata of the step artifact.\n \"\"\"\n if not self.artifacts_path:\n return {}\n\n def iterdir_ignore_hidden(path: Path) -> Generator[Path, None, None]:\n return (f for f in Path(path).iterdir() if not f.name.startswith(\".\"))\n\n artifacts_metadata = defaultdict(list)\n for step_artifacts_dir in iterdir_ignore_hidden(self.artifacts_path):\n step_name = step_artifacts_dir.stem\n for artifact_dir in iterdir_ignore_hidden(step_artifacts_dir):\n artifact_name = artifact_dir.stem\n metadata_path = artifact_dir / \"metadata.json\"\n metadata = json.loads(metadata_path.read_text())\n artifacts_metadata[step_name].append(\n {\"name\": artifact_name, \"metadata\": metadata}\n )\n\n return dict(artifacts_metadata)\n\n def _extract_readme_metadata(\n self, repo_id: str, token: Optional[str]\n ) -> Dict[str, Any]:\n \"\"\"Extracts the metadata from the README.md file of the dataset repository.\n\n We have to download the previous README.md file in the repo, extract the metadata from it,\n and generate a dict again to be passed thorough the `DatasetCardData` object.\n\n Args:\n repo_id: The ID of the repository to push to, from the `push_to_hub` method.\n token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.\n\n Returns:\n The metadata extracted from the README.md file of the dataset repository as a dict.\n \"\"\"\n readme_path = Path(\n hf_hub_download(repo_id, \"README.md\", repo_type=\"dataset\", token=token)\n )\n # Remove the '---' from the metadata\n metadata = re.findall(r\"---\\n(.*?)\\n---\", readme_path.read_text(), re.DOTALL)[0]\n metadata = yaml.safe_load(metadata)\n return metadata\n\n def _generate_card(\n self,\n repo_id: str,\n token: str,\n include_script: bool = False,\n filename_py: Optional[str] = None,\n ) -> None:\n \"\"\"Generates a dataset card and pushes it to the Hugging Face Hub, and\n if the `pipeline.yaml` path is available in the `Distiset`, uploads that\n to the same repository.\n\n Args:\n repo_id: The ID of the repository to push to, from the `push_to_hub` method.\n token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.\n include_script: Whether to upload the script to the hugging face repository.\n filename_py: The name of the script. 
If `include_script` is True, the script will\n be uploaded to the repository using this name, otherwise it won't be used.\n \"\"\"\n card = self._get_card(\n repo_id=repo_id,\n token=token,\n include_script=include_script,\n filename_py=filename_py,\n )\n\n card.push_to_hub(\n repo_id,\n repo_type=\"dataset\",\n token=token,\n )\n\n if self.pipeline_path:\n # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.\n HfApi().upload_file(\n path_or_fileobj=self.pipeline_path,\n path_in_repo=PIPELINE_CONFIG_FILENAME,\n repo_id=repo_id,\n repo_type=\"dataset\",\n token=token,\n )\n\n if self.log_filename_path:\n # The same we had with \"pipeline.yaml\" but with the log file.\n HfApi().upload_file(\n path_or_fileobj=self.log_filename_path,\n path_in_repo=PIPELINE_LOG_FILENAME,\n repo_id=repo_id,\n repo_type=\"dataset\",\n token=token,\n )\n\n def train_test_split(\n self,\n train_size: float,\n shuffle: bool = True,\n seed: Optional[int] = None,\n ) -> Self:\n \"\"\"Return a `Distiset` whose values will be a `datasets.DatasetDict` with two random train and test subsets.\n Splits are created from the dataset according to `train_size` and `shuffle`.\n\n Args:\n train_size:\n Float between `0.0` and `1.0` representing the proportion of the dataset to include in the test split.\n It will be applied to all the datasets in the `Distiset`.\n shuffle: Whether or not to shuffle the data before splitting\n seed:\n A seed to initialize the default BitGenerator, passed to the underlying method.\n\n Returns:\n The `Distiset` with the train-test split applied to all the datasets.\n \"\"\"\n assert 0 < train_size < 1, \"train_size must be a float between 0 and 1\"\n for name, dataset in self.items():\n self[name] = dataset.train_test_split(\n train_size=train_size,\n shuffle=shuffle,\n seed=seed,\n )\n return self\n\n def save_to_disk(\n self,\n distiset_path: PathLike,\n max_shard_size: Optional[Union[str, int]] = None,\n num_shards: Optional[int] = None,\n num_proc: Optional[int] = None,\n storage_options: Optional[dict] = None,\n save_card: bool = True,\n save_pipeline_config: bool = True,\n save_pipeline_log: bool = True,\n ) -> None:\n r\"\"\"\n Saves a `Distiset` to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n In case you want to save the `Distiset` in a remote filesystem, you can pass the `storage_options` parameter\n as you would do with `datasets`'s `Dataset.save_to_disk` method: [see example](https://huggingface.co/docs/datasets/filesystems#saving-serialized-datasets)\n\n Args:\n distiset_path: Path where you want to save the `Distiset`. It can be a local path\n (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)\n max_shard_size: The maximum size of the dataset shards to be uploaded to the hub.\n If expressed as a string, needs to be digits followed by a unit (like `\"50MB\"`).\n Defaults to `None`.\n num_shards: Number of shards to write. By default the number of shards depends on\n `max_shard_size` and `num_proc`. Defaults to `None`.\n num_proc: Number of processes when downloading and generating the dataset locally.\n Multiprocessing is disabled by default. Defaults to `None`.\n storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n save_card: Whether to save the dataset card. 
Defaults to `True`.\n save_pipeline_config: Whether to save the pipeline configuration file (aka the `pipeline.yaml` file).\n Defaults to `True`.\n save_pipeline_log: Whether to save the pipeline log file (aka the `pipeline.log` file).\n Defaults to `True`.\n\n Examples:\n ```python\n # Save your distiset in a local folder:\n distiset.save_to_disk(distiset_path=\"my-distiset\")\n # Save your distiset in a remote storage:\n storage_options = {\n \"key\": os.environ[\"S3_ACCESS_KEY\"],\n \"secret\": os.environ[\"S3_SECRET_KEY\"],\n \"client_kwargs\": {\n \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n \"region_name\": os.environ[\"S3_REGION\"],\n },\n }\n distiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n ```\n \"\"\"\n distiset_path = str(distiset_path)\n for name, dataset in self.items():\n dataset.save_to_disk(\n f\"{distiset_path}/{name}\",\n max_shard_size=max_shard_size,\n num_shards=num_shards,\n num_proc=num_proc,\n storage_options=storage_options,\n )\n\n distiset_config_folder = posixpath.join(distiset_path, DISTISET_CONFIG_FOLDER)\n\n fs: fsspec.AbstractFileSystem\n fs, _, _ = fsspec.get_fs_token_paths(\n distiset_config_folder, storage_options=storage_options\n )\n fs.makedirs(distiset_config_folder, exist_ok=True)\n\n if self.artifacts_path:\n distiset_artifacts_folder = posixpath.join(\n distiset_path, DISTISET_ARTIFACTS_FOLDER\n )\n fs.copy(str(self.artifacts_path), distiset_artifacts_folder, recursive=True)\n\n if save_card:\n # NOTE:\u00a0Currently the card is not the same if we write to disk or push to the HF hub,\n # as we aren't generating the README copying/updating the data from the dataset repo.\n card = self._get_card(repo_id=Path(distiset_path).stem, token=None)\n new_filename = posixpath.join(distiset_config_folder, \"README.md\")\n if storage_options:\n # Write the card the same way as DatasetCard.save does:\n with fs.open(new_filename, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n f.write(str(card))\n else:\n card.save(new_filename)\n\n # Write our internal files to the distiset folder by copying them to the distiset folder.\n if save_pipeline_config and self.pipeline_path:\n new_filename = posixpath.join(\n distiset_config_folder, PIPELINE_CONFIG_FILENAME\n )\n if self.pipeline_path.exists() and (not fs.isfile(new_filename)):\n data = yaml.safe_load(self.pipeline_path.read_text())\n with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n yaml.dump(data, f, default_flow_style=False)\n\n if save_pipeline_log and self.log_filename_path:\n new_filename = posixpath.join(distiset_config_folder, PIPELINE_LOG_FILENAME)\n if self.log_filename_path.exists() and (not fs.isfile(new_filename)):\n data = self.log_filename_path.read_text()\n with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n f.write(data)\n\n @classmethod\n def load_from_disk(\n cls,\n distiset_path: PathLike,\n keep_in_memory: Optional[bool] = None,\n storage_options: Optional[Dict[str, Any]] = None,\n download_dir: Optional[PathLike] = None,\n ) -> Self:\n \"\"\"Loads a dataset that was previously saved using `Distiset.save_to_disk` from a dataset\n directory, or from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n Args:\n distiset_path: Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\").\n keep_in_memory: Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk``\n for more information. 
Defaults to `None`.\n storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n download_dir: Optional directory to download the dataset to. Defaults to None,\n in which case it will create a temporary directory.\n\n Returns:\n A `Distiset` loaded from disk, it should be a `Distiset` object created using `Distiset.save_to_disk`.\n \"\"\"\n original_distiset_path = str(distiset_path)\n\n fs: fsspec.AbstractFileSystem\n fs, _, [distiset_path] = fsspec.get_fs_token_paths( # type: ignore\n original_distiset_path, storage_options=storage_options\n )\n dest_distiset_path = distiset_path\n\n assert fs.isdir(\n original_distiset_path\n ), \"`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem.\"\n\n has_config = False\n has_artifacts = False\n distiset = cls()\n\n if is_remote_filesystem(fs):\n src_dataset_path = distiset_path\n if download_dir:\n dest_distiset_path = download_dir\n else:\n dest_distiset_path = Dataset._build_local_temp_path(src_dataset_path) # type: ignore\n fs.download(src_dataset_path, dest_distiset_path.as_posix(), recursive=True) # type: ignore\n\n # Now we should have the distiset locally, so we can read those files\n for folder in Path(dest_distiset_path).iterdir():\n if folder.stem == DISTISET_CONFIG_FOLDER:\n has_config = True\n continue\n elif folder.stem == DISTISET_ARTIFACTS_FOLDER:\n has_artifacts = True\n continue\n distiset[folder.stem] = load_from_disk(\n str(folder),\n keep_in_memory=keep_in_memory,\n )\n\n # From the config folder we just need to point to the files. Once downloaded we set the path to point to point to the files. Once downloaded we set the path\n # to wherever they are.\n if has_config:\n distiset_config_folder = posixpath.join(\n dest_distiset_path, DISTISET_CONFIG_FOLDER\n )\n\n pipeline_path = posixpath.join(\n distiset_config_folder, PIPELINE_CONFIG_FILENAME\n )\n if Path(pipeline_path).exists():\n distiset.pipeline_path = Path(pipeline_path)\n\n log_filename_path = posixpath.join(\n distiset_config_folder, PIPELINE_LOG_FILENAME\n )\n if Path(log_filename_path).exists():\n distiset.log_filename_path = Path(log_filename_path)\n\n if has_artifacts:\n distiset.artifacts_path = Path(\n posixpath.join(dest_distiset_path, DISTISET_ARTIFACTS_FOLDER)\n )\n\n return distiset\n\n @property\n def pipeline_path(self) -> Union[Path, None]:\n \"\"\"Returns the path to the `pipeline.yaml` file that generated the `Pipeline`.\"\"\"\n return self._pipeline_path\n\n @pipeline_path.setter\n def pipeline_path(self, path: PathLike) -> None:\n self._pipeline_path = Path(path)\n\n @property\n def artifacts_path(self) -> Union[Path, None]:\n \"\"\"Returns the path to the directory containing the artifacts generated by the steps\n of the pipeline.\"\"\"\n return self._artifacts_path\n\n @artifacts_path.setter\n def artifacts_path(self, path: PathLike) -> None:\n self._artifacts_path = Path(path)\n\n @property\n def log_filename_path(self) -> Union[Path, None]:\n \"\"\"Returns the path to the `pipeline.log` file that generated the `Pipeline`.\"\"\"\n return self._log_filename_path\n\n @log_filename_path.setter\n def log_filename_path(self, path: PathLike) -> None:\n self._log_filename_path = Path(path)\n\n @property\n def citations(self) -> Union[List[str], None]:\n \"\"\"Bibtex references to be included in the README.\"\"\"\n return self._citations\n\n @citations.setter\n def citations(self, citations_: List[str]) -> None:\n self._citations = sorted(set(citations_))\n\n def 
__repr__(self):\n # Copy from `datasets.DatasetDict.__repr__`.\n repr = \"\\n\".join([f\"{k}: {v}\" for k, v in self.items()])\n repr = re.sub(r\"^\", \" \" * 4, repr, count=0, flags=re.M)\n return f\"Distiset({{\\n{repr}\\n}})\"\n "},{"location":"api/distiset/#distilabel.distiset.Distiset.pipeline_path","title":"pipeline_path: Union[Path, None] property writable ","text":"Returns the path to the pipeline.yaml file that generated the Pipeline . "},{"location":"api/distiset/#distilabel.distiset.Distiset.artifacts_path","title":"artifacts_path: Union[Path, None] property writable ","text":"Returns the path to the directory containing the artifacts generated by the steps of the pipeline. "},{"location":"api/distiset/#distilabel.distiset.Distiset.log_filename_path","title":"log_filename_path: Union[Path, None] property writable ","text":"Returns the path to the pipeline.log file that generated the Pipeline . "},{"location":"api/distiset/#distilabel.distiset.Distiset.citations","title":"citations: Union[List[str], None] property writable ","text":"Bibtex references to be included in the README. "},{"location":"api/distiset/#distilabel.distiset.Distiset.push_to_hub","title":"push_to_hub(repo_id, private=False, token=None, generate_card=True, include_script=False, **kwargs) ","text":"Pushes the Distiset to the Hugging Face Hub, each dataset will be pushed as a different configuration corresponding to the leaf step that generated it. Parameters: Name Type Description Default repo_id str The ID of the repository to push to in the following format: <user>/<dataset_name> or <org>/<dataset_name> . Also accepts <dataset_name> , which will default to the namespace of the logged-in user. required private bool Whether the dataset repository should be set to private or not. Only affects repository creation: a repository that already exists will not be affected by that parameter. False token Optional[str] An optional authentication token for the Hugging Face Hub. If no token is passed, will default to the token saved locally when logging in with huggingface-cli login . Will raise an error if no token is passed and the user is not logged-in. None generate_card bool Whether to generate a dataset card or not. Defaults to True. True include_script bool Whether you want to push the pipeline script to the hugging face hub to share it. If set to True, the name of the script that was run to create the distiset will be automatically determined, and that will be the name of the file uploaded to your repository. Take into account, this operation only makes sense for a distiset obtained from calling Pipeline.run() method. Defaults to False. False **kwargs Any Additional keyword arguments to pass to the push_to_hub method of the datasets.Dataset object. {} Raises: Type Description ValueError If no token is provided and couldn't be retrieved automatically. Source code in src/distilabel/distiset.py def push_to_hub(\n self,\n repo_id: str,\n private: bool = False,\n token: Optional[str] = None,\n generate_card: bool = True,\n include_script: bool = False,\n **kwargs: Any,\n) -> None:\n \"\"\"Pushes the `Distiset` to the Hugging Face Hub, each dataset will be pushed as a different configuration\n corresponding to the leaf step that generated it.\n\n Args:\n repo_id:\n The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n `<org>/<dataset_name>`. 
Also accepts `<dataset_name>`, which will default to the namespace\n of the logged-in user.\n private:\n Whether the dataset repository should be set to private or not. Only affects repository creation:\n a repository that already exists will not be affected by that parameter.\n token:\n An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n if no token is passed and the user is not logged-in.\n generate_card:\n Whether to generate a dataset card or not. Defaults to True.\n include_script:\n Whether you want to push the pipeline script to the hugging face hub to share it.\n If set to True, the name of the script that was run to create the distiset will be\n automatically determined, and that will be the name of the file uploaded to your\n repository. Take into account, this operation only makes sense for a distiset obtained\n from calling `Pipeline.run()` method. Defaults to False.\n **kwargs:\n Additional keyword arguments to pass to the `push_to_hub` method of the `datasets.Dataset` object.\n\n Raises:\n ValueError: If no token is provided and couldn't be retrieved automatically.\n \"\"\"\n script_filename = sys.argv[0]\n filename_py = (\n script_filename.split(\"/\")[-1]\n if \"/\" in script_filename\n else script_filename\n )\n script_path = Path.cwd() / script_filename\n\n if token is None:\n token = get_hf_token(self.__class__.__name__, \"token\")\n\n for name, dataset in self.items():\n dataset.push_to_hub(\n repo_id=repo_id,\n config_name=name,\n private=private,\n token=token,\n **kwargs,\n )\n\n if self.artifacts_path:\n upload_folder(\n repo_id=repo_id,\n folder_path=self.artifacts_path,\n path_in_repo=\"artifacts\",\n token=token,\n repo_type=\"dataset\",\n commit_message=\"Include pipeline artifacts\",\n )\n\n if include_script and script_path.exists():\n upload_file(\n path_or_fileobj=script_path,\n path_in_repo=filename_py,\n repo_id=repo_id,\n repo_type=\"dataset\",\n token=token,\n commit_message=\"Include pipeline script\",\n )\n\n if generate_card:\n self._generate_card(\n repo_id, token, include_script=include_script, filename_py=filename_py\n )\n "},{"location":"api/distiset/#distilabel.distiset.Distiset.train_test_split","title":"train_test_split(train_size, shuffle=True, seed=None) ","text":"Return a Distiset whose values will be a datasets.DatasetDict with two random train and test subsets. Splits are created from the dataset according to train_size and shuffle . Parameters: Name Type Description Default train_size float Float between 0.0 and 1.0 representing the proportion of the dataset to include in the test split. It will be applied to all the datasets in the Distiset . required shuffle bool Whether or not to shuffle the data before splitting True seed Optional[int] A seed to initialize the default BitGenerator, passed to the underlying method. None Returns: Type Description Self The Distiset with the train-test split applied to all the datasets. 
Source code in src/distilabel/distiset.py def train_test_split(\n self,\n train_size: float,\n shuffle: bool = True,\n seed: Optional[int] = None,\n) -> Self:\n \"\"\"Return a `Distiset` whose values will be a `datasets.DatasetDict` with two random train and test subsets.\n Splits are created from the dataset according to `train_size` and `shuffle`.\n\n Args:\n train_size:\n Float between `0.0` and `1.0` representing the proportion of the dataset to include in the test split.\n It will be applied to all the datasets in the `Distiset`.\n shuffle: Whether or not to shuffle the data before splitting\n seed:\n A seed to initialize the default BitGenerator, passed to the underlying method.\n\n Returns:\n The `Distiset` with the train-test split applied to all the datasets.\n \"\"\"\n assert 0 < train_size < 1, \"train_size must be a float between 0 and 1\"\n for name, dataset in self.items():\n self[name] = dataset.train_test_split(\n train_size=train_size,\n shuffle=shuffle,\n seed=seed,\n )\n return self\n "},{"location":"api/distiset/#distilabel.distiset.Distiset.save_to_disk","title":"save_to_disk(distiset_path, max_shard_size=None, num_shards=None, num_proc=None, storage_options=None, save_card=True, save_pipeline_config=True, save_pipeline_log=True) ","text":"Saves a Distiset to a dataset directory, or in a filesystem using any implementation of fsspec.spec.AbstractFileSystem . In case you want to save the Distiset in a remote filesystem, you can pass the storage_options parameter as you would do with datasets 's Dataset.save_to_disk method: see example Parameters: Name Type Description Default distiset_path PathLike Path where you want to save the Distiset . It can be a local path (e.g. dataset/train ) or remote URI (e.g. s3://my-bucket/dataset/train ) required max_shard_size Optional[Union[str, int]] The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit (like \"50MB\" ). Defaults to None . None num_shards Optional[int] Number of shards to write. By default the number of shards depends on max_shard_size and num_proc . Defaults to None . None num_proc Optional[int] Number of processes when downloading and generating the dataset locally. Multiprocessing is disabled by default. Defaults to None . None storage_options Optional[dict] Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . None save_card bool Whether to save the dataset card. Defaults to True . True save_pipeline_config bool Whether to save the pipeline configuration file (aka the pipeline.yaml file). Defaults to True . True save_pipeline_log bool Whether to save the pipeline log file (aka the pipeline.log file). Defaults to True . 
True Examples: # Save your distiset in a local folder:\ndistiset.save_to_disk(distiset_path=\"my-distiset\")\n# Save your distiset in a remote storage:\nstorage_options = {\n \"key\": os.environ[\"S3_ACCESS_KEY\"],\n \"secret\": os.environ[\"S3_SECRET_KEY\"],\n \"client_kwargs\": {\n \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n \"region_name\": os.environ[\"S3_REGION\"],\n },\n}\ndistiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n Source code in src/distilabel/distiset.py def save_to_disk(\n self,\n distiset_path: PathLike,\n max_shard_size: Optional[Union[str, int]] = None,\n num_shards: Optional[int] = None,\n num_proc: Optional[int] = None,\n storage_options: Optional[dict] = None,\n save_card: bool = True,\n save_pipeline_config: bool = True,\n save_pipeline_log: bool = True,\n) -> None:\n r\"\"\"\n Saves a `Distiset` to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n In case you want to save the `Distiset` in a remote filesystem, you can pass the `storage_options` parameter\n as you would do with `datasets`'s `Dataset.save_to_disk` method: [see example](https://huggingface.co/docs/datasets/filesystems#saving-serialized-datasets)\n\n Args:\n distiset_path: Path where you want to save the `Distiset`. It can be a local path\n (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)\n max_shard_size: The maximum size of the dataset shards to be uploaded to the hub.\n If expressed as a string, needs to be digits followed by a unit (like `\"50MB\"`).\n Defaults to `None`.\n num_shards: Number of shards to write. By default the number of shards depends on\n `max_shard_size` and `num_proc`. Defaults to `None`.\n num_proc: Number of processes when downloading and generating the dataset locally.\n Multiprocessing is disabled by default. Defaults to `None`.\n storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n save_card: Whether to save the dataset card. 
Defaults to `True`.\n save_pipeline_config: Whether to save the pipeline configuration file (aka the `pipeline.yaml` file).\n Defaults to `True`.\n save_pipeline_log: Whether to save the pipeline log file (aka the `pipeline.log` file).\n Defaults to `True`.\n\n Examples:\n ```python\n # Save your distiset in a local folder:\n distiset.save_to_disk(distiset_path=\"my-distiset\")\n # Save your distiset in a remote storage:\n storage_options = {\n \"key\": os.environ[\"S3_ACCESS_KEY\"],\n \"secret\": os.environ[\"S3_SECRET_KEY\"],\n \"client_kwargs\": {\n \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n \"region_name\": os.environ[\"S3_REGION\"],\n },\n }\n distiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n ```\n \"\"\"\n distiset_path = str(distiset_path)\n for name, dataset in self.items():\n dataset.save_to_disk(\n f\"{distiset_path}/{name}\",\n max_shard_size=max_shard_size,\n num_shards=num_shards,\n num_proc=num_proc,\n storage_options=storage_options,\n )\n\n distiset_config_folder = posixpath.join(distiset_path, DISTISET_CONFIG_FOLDER)\n\n fs: fsspec.AbstractFileSystem\n fs, _, _ = fsspec.get_fs_token_paths(\n distiset_config_folder, storage_options=storage_options\n )\n fs.makedirs(distiset_config_folder, exist_ok=True)\n\n if self.artifacts_path:\n distiset_artifacts_folder = posixpath.join(\n distiset_path, DISTISET_ARTIFACTS_FOLDER\n )\n fs.copy(str(self.artifacts_path), distiset_artifacts_folder, recursive=True)\n\n if save_card:\n # NOTE:\u00a0Currently the card is not the same if we write to disk or push to the HF hub,\n # as we aren't generating the README copying/updating the data from the dataset repo.\n card = self._get_card(repo_id=Path(distiset_path).stem, token=None)\n new_filename = posixpath.join(distiset_config_folder, \"README.md\")\n if storage_options:\n # Write the card the same way as DatasetCard.save does:\n with fs.open(new_filename, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n f.write(str(card))\n else:\n card.save(new_filename)\n\n # Write our internal files to the distiset folder by copying them to the distiset folder.\n if save_pipeline_config and self.pipeline_path:\n new_filename = posixpath.join(\n distiset_config_folder, PIPELINE_CONFIG_FILENAME\n )\n if self.pipeline_path.exists() and (not fs.isfile(new_filename)):\n data = yaml.safe_load(self.pipeline_path.read_text())\n with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n yaml.dump(data, f, default_flow_style=False)\n\n if save_pipeline_log and self.log_filename_path:\n new_filename = posixpath.join(distiset_config_folder, PIPELINE_LOG_FILENAME)\n if self.log_filename_path.exists() and (not fs.isfile(new_filename)):\n data = self.log_filename_path.read_text()\n with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n f.write(data)\n "},{"location":"api/distiset/#distilabel.distiset.Distiset.load_from_disk","title":"load_from_disk(distiset_path, keep_in_memory=None, storage_options=None, download_dir=None) classmethod ","text":"Loads a dataset that was previously saved using Distiset.save_to_disk from a dataset directory, or from a filesystem using any implementation of fsspec.spec.AbstractFileSystem . Parameters: Name Type Description Default distiset_path PathLike Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\"). required keep_in_memory Optional[bool] Whether to copy the dataset in-memory, see datasets.Dataset.load_from_disk`` for more information. Defaults to None`. 
None storage_options Optional[Dict[str, Any]] Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . None download_dir Optional[PathLike] Optional directory to download the dataset to. Defaults to None, in which case it will create a temporary directory. None Returns: Type Description Self A Distiset loaded from disk, it should be a Distiset object created using Distiset.save_to_disk . Source code in src/distilabel/distiset.py @classmethod\ndef load_from_disk(\n cls,\n distiset_path: PathLike,\n keep_in_memory: Optional[bool] = None,\n storage_options: Optional[Dict[str, Any]] = None,\n download_dir: Optional[PathLike] = None,\n) -> Self:\n \"\"\"Loads a dataset that was previously saved using `Distiset.save_to_disk` from a dataset\n directory, or from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n Args:\n distiset_path: Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\").\n keep_in_memory: Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk``\n for more information. Defaults to `None`.\n storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n download_dir: Optional directory to download the dataset to. Defaults to None,\n in which case it will create a temporary directory.\n\n Returns:\n A `Distiset` loaded from disk, it should be a `Distiset` object created using `Distiset.save_to_disk`.\n \"\"\"\n original_distiset_path = str(distiset_path)\n\n fs: fsspec.AbstractFileSystem\n fs, _, [distiset_path] = fsspec.get_fs_token_paths( # type: ignore\n original_distiset_path, storage_options=storage_options\n )\n dest_distiset_path = distiset_path\n\n assert fs.isdir(\n original_distiset_path\n ), \"`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem.\"\n\n has_config = False\n has_artifacts = False\n distiset = cls()\n\n if is_remote_filesystem(fs):\n src_dataset_path = distiset_path\n if download_dir:\n dest_distiset_path = download_dir\n else:\n dest_distiset_path = Dataset._build_local_temp_path(src_dataset_path) # type: ignore\n fs.download(src_dataset_path, dest_distiset_path.as_posix(), recursive=True) # type: ignore\n\n # Now we should have the distiset locally, so we can read those files\n for folder in Path(dest_distiset_path).iterdir():\n if folder.stem == DISTISET_CONFIG_FOLDER:\n has_config = True\n continue\n elif folder.stem == DISTISET_ARTIFACTS_FOLDER:\n has_artifacts = True\n continue\n distiset[folder.stem] = load_from_disk(\n str(folder),\n keep_in_memory=keep_in_memory,\n )\n\n # From the config folder we just need to point to the files. Once downloaded we set the path to point to point to the files. 
Once downloaded we set the path\n # to wherever they are.\n if has_config:\n distiset_config_folder = posixpath.join(\n dest_distiset_path, DISTISET_CONFIG_FOLDER\n )\n\n pipeline_path = posixpath.join(\n distiset_config_folder, PIPELINE_CONFIG_FILENAME\n )\n if Path(pipeline_path).exists():\n distiset.pipeline_path = Path(pipeline_path)\n\n log_filename_path = posixpath.join(\n distiset_config_folder, PIPELINE_LOG_FILENAME\n )\n if Path(log_filename_path).exists():\n distiset.log_filename_path = Path(log_filename_path)\n\n if has_artifacts:\n distiset.artifacts_path = Path(\n posixpath.join(dest_distiset_path, DISTISET_ARTIFACTS_FOLDER)\n )\n\n return distiset\n "},{"location":"api/distiset/#distilabel.distiset.create_distiset","title":"create_distiset(data_dir, pipeline_path=None, log_filename_path=None, enable_metadata=False, dag=None) ","text":"Creates a Distiset from the buffer folder. This function is intended to be used as a helper to create a Distiset from from the folder where the cached data was written by the _WriteBuffer . Parameters: Name Type Description Default data_dir Path Folder where the data buffers were written by the _WriteBuffer . It should correspond to CacheLocation.data . required pipeline_path Optional[Path] Optional path to the pipeline.yaml file that generated the dataset. Internally this will be passed to the Distiset object on creation to allow uploading the pipeline.yaml file to the repo upon Distiset.push_to_hub . None log_filename_path Optional[Path] Optional path to the pipeline.log file that was generated during the pipeline run. Internally this will be passed to the Distiset object on creation to allow uploading the pipeline.log file to the repo upon Distiset.push_to_hub . None enable_metadata bool Whether to include the distilabel metadata column in the dataset or not. Defaults to False . False dag Optional[DAG] DAG contained in a Pipeline . If informed, will be used to extract the references/ citations from it. None Returns: Type Description Distiset The dataset created from the buffer folder, where the different leaf steps will Distiset correspond to different configurations of the dataset. 
Examples: from pathlib import Path\ndistiset = create_distiset(Path.home() / \".cache/distilabel/pipelines/path-to-pipe-hashname\")\n Source code in src/distilabel/distiset.py def create_distiset( # noqa: C901\n data_dir: Path,\n pipeline_path: Optional[Path] = None,\n log_filename_path: Optional[Path] = None,\n enable_metadata: bool = False,\n dag: Optional[\"DAG\"] = None,\n) -> Distiset:\n \"\"\"Creates a `Distiset` from the buffer folder.\n\n This function is intended to be used as a helper to create a `Distiset` from from the folder\n where the cached data was written by the `_WriteBuffer`.\n\n Args:\n data_dir: Folder where the data buffers were written by the `_WriteBuffer`.\n It should correspond to `CacheLocation.data`.\n pipeline_path: Optional path to the pipeline.yaml file that generated the dataset.\n Internally this will be passed to the `Distiset` object on creation to allow\n uploading the `pipeline.yaml` file to the repo upon `Distiset.push_to_hub`.\n log_filename_path: Optional path to the pipeline.log file that was generated during the pipeline run.\n Internally this will be passed to the `Distiset` object on creation to allow\n uploading the `pipeline.log` file to the repo upon `Distiset.push_to_hub`.\n enable_metadata: Whether to include the distilabel metadata column in the dataset or not.\n Defaults to `False`.\n dag: DAG contained in a `Pipeline`. If informed, will be used to extract the references/\n citations from it.\n\n Returns:\n The dataset created from the buffer folder, where the different leaf steps will\n correspond to different configurations of the dataset.\n\n Examples:\n ```python\n from pathlib import Path\n distiset = create_distiset(Path.home() / \".cache/distilabel/pipelines/path-to-pipe-hashname\")\n ```\n \"\"\"\n from distilabel.constants import DISTILABEL_METADATA_KEY\n\n logger = logging.getLogger(\"distilabel.distiset\")\n\n steps_outputs_dir = data_dir / STEPS_OUTPUTS_PATH\n\n distiset = Distiset()\n for file in steps_outputs_dir.iterdir():\n if file.is_file():\n continue\n\n files = [str(file) for file in list_files_in_dir(file)]\n if files:\n try:\n ds = load_dataset(\n \"parquet\", name=file.stem, data_files={\"train\": files}\n )\n if not enable_metadata and DISTILABEL_METADATA_KEY in ds.column_names:\n ds = ds.remove_columns(DISTILABEL_METADATA_KEY)\n distiset[file.stem] = ds\n except ArrowInvalid:\n logger.warning(f\"\u274c Failed to load the subset from '{file}' directory.\")\n continue\n else:\n logger.warning(\n f\"No output files for step '{file.stem}', can't create a dataset.\"\n \" Did the step produce any data?\"\n )\n\n # If there's only one dataset i.e. 
one config, then set the config name to `default`\n if len(distiset.keys()) == 1:\n distiset[\"default\"] = distiset.pop(list(distiset.keys())[0])\n\n # If there's any artifact set the `artifacts_path` so they can be uploaded\n steps_artifacts_dir = data_dir / STEPS_ARTIFACTS_PATH\n if any(steps_artifacts_dir.rglob(\"*\")):\n distiset.artifacts_path = steps_artifacts_dir\n\n # Include `pipeline.yaml` if exists\n if pipeline_path:\n distiset.pipeline_path = pipeline_path\n else:\n # If the pipeline path is not provided, try to find it in the parent directory\n # and assume that's the wanted file.\n pipeline_path = steps_outputs_dir.parent / \"pipeline.yaml\"\n if pipeline_path.exists():\n distiset.pipeline_path = pipeline_path\n\n # Include `pipeline.log` if exists\n if log_filename_path:\n distiset.log_filename_path = log_filename_path\n else:\n log_filename_path = steps_outputs_dir.parent / \"pipeline.log\"\n if log_filename_path.exists():\n distiset.log_filename_path = log_filename_path\n\n if dag:\n distiset._citations = _grab_citations(dag)\n\n return distiset\n "},{"location":"api/errors/","title":"Errors","text":"This section contains the distilabel custom errors. Unlike exceptions, errors in distilabel are used to handle unexpected situations that can't be anticipated and that can't be handled in a controlled way. "},{"location":"api/errors/#distilabel.errors.DistilabelError","title":"DistilabelError ","text":"A mixin class for common functionality shared by all Distilabel-specific errors. Attributes: Name Type Description message A message describing the error. page An optional error code from PydanticErrorCodes enum. Examples: raise DistilabelUserError(\"This is an error message.\")\nThis is an error message.\n\nraise DistilabelUserError(\"This is an error message.\", page=\"sections/getting_started/faq/\")\nThis is an error message.\nFor further information visit 'https://distilabel.argilla.io/latest/sections/getting_started/faq/'\n Source code in src/distilabel/errors.py class DistilabelError:\n \"\"\"A mixin class for common functionality shared by all Distilabel-specific errors.\n\n Attributes:\n message: A message describing the error.\n page: An optional error code from PydanticErrorCodes enum.\n\n Examples:\n ```python\n raise DistilabelUserError(\"This is an error message.\")\n This is an error message.\n\n raise DistilabelUserError(\"This is an error message.\", page=\"sections/getting_started/faq/\")\n This is an error message.\n For further information visit 'https://distilabel.argilla.io/latest/sections/getting_started/faq/'\n ```\n \"\"\"\n\n def __init__(self, message: str, *, page: Optional[str] = None) -> None:\n self.message = message\n self.page = page\n\n def __str__(self) -> str:\n if self.page is None:\n return self.message\n else:\n return f\"{self.message}\\n\\nFor further information visit '{DISTILABEL_DOCS_URL}{self.page}'\"\n "},{"location":"api/errors/#distilabel.errors.DistilabelUserError","title":"DistilabelUserError ","text":" Bases: DistilabelError , ValueError ValueError that we can redirect to a given page in the documentation. Source code in src/distilabel/errors.py class DistilabelUserError(DistilabelError, ValueError):\n \"\"\"ValueError that we can redirect to a given page in the documentation.\"\"\"\n\n pass\n "},{"location":"api/errors/#distilabel.errors.DistilabelTypeError","title":"DistilabelTypeError ","text":" Bases: DistilabelError , TypeError TypeError that we can redirect to a given page in the documentation. 
Source code in src/distilabel/errors.py class DistilabelTypeError(DistilabelError, TypeError):\n \"\"\"TypeError that we can redirect to a given page in the documentation.\"\"\"\n\n pass\n "},{"location":"api/errors/#distilabel.errors.DistilabelNotImplementedError","title":"DistilabelNotImplementedError ","text":" Bases: DistilabelError , NotImplementedError NotImplementedError that we can redirect to a given page in the documentation. Source code in src/distilabel/errors.py class DistilabelNotImplementedError(DistilabelError, NotImplementedError):\n \"\"\"NotImplementedError that we can redirect to a given page in the documentation.\"\"\"\n\n pass\n "},{"location":"api/exceptions/","title":"Exceptions","text":"This section contains the distilabel custom exceptions. Unlike errors, exceptions in distilabel are used to handle specific situations that can be anticipated and that can be handled in a controlled way internally by the library. "},{"location":"api/exceptions/#distilabel.exceptions.DistilabelException","title":"DistilabelException ","text":" Bases: Exception Base exception (can be gracefully handled) for distilabel framework. Source code in src/distilabel/exceptions.py class DistilabelException(Exception):\n \"\"\"Base exception (can be gracefully handled) for `distilabel` framework.\"\"\"\n\n pass\n "},{"location":"api/exceptions/#distilabel.exceptions.DistilabelGenerationException","title":"DistilabelGenerationException ","text":" Bases: DistilabelException Base exception for LLM generation errors. Source code in src/distilabel/exceptions.py class DistilabelGenerationException(DistilabelException):\n \"\"\"Base exception for `LLM` generation errors.\"\"\"\n\n pass\n "},{"location":"api/exceptions/#distilabel.exceptions.DistilabelOfflineBatchGenerationNotFinishedException","title":"DistilabelOfflineBatchGenerationNotFinishedException ","text":" Bases: DistilabelGenerationException Exception raised when a batch generation is not finished. Source code in src/distilabel/exceptions.py class DistilabelOfflineBatchGenerationNotFinishedException(\n DistilabelGenerationException\n):\n \"\"\"Exception raised when a batch generation is not finished.\"\"\"\n\n jobs_ids: Tuple[str, ...]\n\n def __init__(self, jobs_ids: Tuple[str, ...]) -> None:\n self.jobs_ids = jobs_ids\n super().__init__(f\"Batch generation with jobs_ids={jobs_ids} is not finished\")\n "},{"location":"api/mixins/requirements/","title":"RequirementsMixin","text":""},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin","title":"RequirementsMixin ","text":"Mixin for classes that have requirements attribute. Used to add requirements to a Step and a Pipeline . 
Source code in src/distilabel/mixins/requirements.py class RequirementsMixin:\n \"\"\"Mixin for classes that have `requirements` attribute.\n\n Used to add requirements to a `Step` and a `Pipeline`.\n \"\"\"\n\n _requirements: Union[List[Requirement], None] = []\n\n def _gather_requirements(self) -> List[str]:\n \"\"\"This method will be overwritten in the `BasePipeline` class to gather the requirements\n from each step.\n \"\"\"\n return []\n\n @property\n def requirements(self) -> List[str]:\n \"\"\"Return a list of requirements that must be installed to run the `Pipeline`.\n\n The requirements in a Pipeline will include the requirements from all the steps (if any).\n\n Returns:\n List of requirements that must be installed to run the `Pipeline`, sorted alphabetically.\n \"\"\"\n self.requirements = self._gather_requirements()\n\n return [str(r) for r in self._requirements]\n\n @requirements.setter\n def requirements(self, _requirements: List[str]) -> None:\n requirements = []\n if not isinstance(_requirements, list):\n _requirements = [_requirements]\n\n for r in _requirements:\n try:\n requirements.append(Requirement(r))\n except InvalidRequirement:\n self._logger.warning(f\"Invalid requirement: `{r}`\")\n\n self._requirements = sorted(\n set(self._requirements).union(set(requirements)), key=lambda x: str(x)\n )\n\n def requirements_to_install(self) -> List[str]:\n \"\"\"Check if the requirements are installed in the current environment, and returns the ones that aren't.\n\n Returns:\n List of requirements required to run the pipeline that are not installed in the current environment.\n \"\"\"\n\n to_install = []\n for req in self.requirements:\n requirement = Requirement(req)\n if importlib.util.find_spec(requirement.name):\n if (str(requirement.specifier) != \"\") and (\n version(requirement.name) != str(requirement.specifier)\n ):\n to_install.append(req)\n else:\n to_install.append(req)\n return to_install\n "},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin.requirements","title":"requirements: List[str] property writable ","text":"Return a list of requirements that must be installed to run the Pipeline . The requirements in a Pipeline will include the requirements from all the steps (if any). Returns: Type Description List[str] List of requirements that must be installed to run the Pipeline , sorted alphabetically. "},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin.requirements_to_install","title":"requirements_to_install() ","text":"Check if the requirements are installed in the current environment, and returns the ones that aren't. Returns: Type Description List[str] List of requirements required to run the pipeline that are not installed in the current environment. 
Source code in src/distilabel/mixins/requirements.py def requirements_to_install(self) -> List[str]:\n \"\"\"Check if the requirements are installed in the current environment, and returns the ones that aren't.\n\n Returns:\n List of requirements required to run the pipeline that are not installed in the current environment.\n \"\"\"\n\n to_install = []\n for req in self.requirements:\n requirement = Requirement(req)\n if importlib.util.find_spec(requirement.name):\n if (str(requirement.specifier) != \"\") and (\n version(requirement.name) != str(requirement.specifier)\n ):\n to_install.append(req)\n else:\n to_install.append(req)\n return to_install\n "},{"location":"api/mixins/runtime_parameters/","title":"RuntimeParametersMixin","text":""},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin","title":"RuntimeParametersMixin ","text":" Bases: BaseModel Mixin for classes that have RuntimeParameter s attributes. Attributes: Name Type Description _runtime_parameters Dict[str, Any] A dictionary containing the values of the runtime parameters of the class. This attribute is meant to be used internally and should not be accessed directly. Source code in src/distilabel/mixins/runtime_parameters.py class RuntimeParametersMixin(BaseModel):\n \"\"\"Mixin for classes that have `RuntimeParameter`s attributes.\n\n Attributes:\n _runtime_parameters: A dictionary containing the values of the runtime parameters\n of the class. This attribute is meant to be used internally and should not be\n accessed directly.\n \"\"\"\n\n _runtime_parameters: Dict[str, Any] = PrivateAttr(default_factory=dict)\n\n @property\n def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n \"\"\"Returns a dictionary containing the name of the runtime parameters of the class\n as keys and whether the parameter is required or not as values.\n\n Returns:\n A dictionary containing the name of the runtime parameters of the class as keys\n and whether the parameter is required or not as values.\n \"\"\"\n\n runtime_parameters = {}\n\n for name, field_info in self.model_fields.items(): # type: ignore\n # `field: RuntimeParameter[Any]` or `field: Optional[RuntimeParameter[Any]]`\n is_runtime_param, is_optional = _is_runtime_parameter(field_info)\n if is_runtime_param:\n runtime_parameters[name] = is_optional\n continue\n\n attr = getattr(self, name)\n\n # `field: RuntimeParametersMixin`\n if isinstance(attr, RuntimeParametersMixin):\n runtime_parameters[name] = attr.runtime_parameters_names\n\n # `field: List[RuntimeParametersMixin]`\n if (\n isinstance(attr, list)\n and attr\n and isinstance(attr[0], RuntimeParametersMixin)\n ):\n runtime_parameters[name] = {\n str(i): item.runtime_parameters_names for i, item in enumerate(attr)\n }\n\n return runtime_parameters\n\n def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n \"\"\"Gets the information of the runtime parameters of the class such as the name and\n the description. 
This function is meant to include the information of the runtime\n parameters in the serialized data of the class.\n\n Returns:\n A list containing the information for each runtime parameter of the class.\n \"\"\"\n runtime_parameters_info = []\n for name, field_info in self.model_fields.items(): # type: ignore\n if name not in self.runtime_parameters_names:\n continue\n\n attr = getattr(self, name)\n\n # Get runtime parameters info for `RuntimeParametersMixin` field\n if isinstance(attr, RuntimeParametersMixin):\n runtime_parameters_info.append(\n {\n \"name\": name,\n \"runtime_parameters_info\": attr.get_runtime_parameters_info(),\n }\n )\n continue\n\n # Get runtime parameters info for `List[RuntimeParametersMixin]` field\n if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n runtime_parameters_info.append(\n {\n \"name\": name,\n \"runtime_parameters_info\": {\n str(i): item.get_runtime_parameters_info()\n for i, item in enumerate(attr)\n },\n }\n )\n continue\n\n info = {\"name\": name, \"optional\": self.runtime_parameters_names[name]}\n if field_info.description is not None:\n info[\"description\"] = field_info.description\n runtime_parameters_info.append(info)\n return runtime_parameters_info\n\n def set_runtime_parameters(self, runtime_parameters: Dict[str, Any]) -> None:\n \"\"\"Sets the runtime parameters of the class using the provided values. If the attr\n to be set is a `RuntimeParametersMixin`, it will call `set_runtime_parameters` on\n the attr.\n\n Args:\n runtime_parameters: A dictionary containing the values of the runtime parameters\n to set.\n \"\"\"\n runtime_parameters_names = list(self.runtime_parameters_names.keys())\n for name, value in runtime_parameters.items():\n if name not in self.runtime_parameters_names:\n # Check done just to ensure the unit tests for the mixin run\n if getattr(self, \"pipeline\", None):\n closest = difflib.get_close_matches(\n name, runtime_parameters_names, cutoff=0.5\n )\n msg = (\n f\"\u26a0\ufe0f Runtime parameter '{name}' unknown in step '{self.name}'.\" # type: ignore\n )\n if closest:\n msg += f\" Did you mean any of: {closest}\"\n else:\n msg += f\" Available runtime parameters for the step: {runtime_parameters_names}.\"\n self.pipeline._logger.warning(msg) # type: ignore\n continue\n\n attr = getattr(self, name)\n\n # Set runtime parameters for `RuntimeParametersMixin` field\n if isinstance(attr, RuntimeParametersMixin):\n attr.set_runtime_parameters(value)\n self._runtime_parameters[name] = value\n continue\n\n # Set runtime parameters for `List[RuntimeParametersMixin]` field\n if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n for i, item in enumerate(attr):\n item_value = value.get(str(i), {})\n item.set_runtime_parameters(item_value)\n self._runtime_parameters[name] = value\n continue\n\n # Handle settings values for `_SecretField`\n field_info = self.model_fields[name]\n inner_type = extract_annotation_inner_type(field_info.annotation)\n if is_type_pydantic_secret_field(inner_type):\n value = inner_type(value)\n\n # Set the value of the runtime parameter\n setattr(self, name, value)\n self._runtime_parameters[name] = value\n "},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property ","text":"Returns a dictionary containing the name of the runtime parameters of the class as keys and whether the parameter is required or not as values. 
Returns: Type Description RuntimeParametersNames A dictionary containing the name of the runtime parameters of the class as keys RuntimeParametersNames and whether the parameter is required or not as values. "},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.get_runtime_parameters_info","title":"get_runtime_parameters_info() ","text":"Gets the information of the runtime parameters of the class such as the name and the description. This function is meant to include the information of the runtime parameters in the serialized data of the class. Returns: Type Description List[RuntimeParameterInfo] A list containing the information for each runtime parameter of the class. Source code in src/distilabel/mixins/runtime_parameters.py def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n \"\"\"Gets the information of the runtime parameters of the class such as the name and\n the description. This function is meant to include the information of the runtime\n parameters in the serialized data of the class.\n\n Returns:\n A list containing the information for each runtime parameter of the class.\n \"\"\"\n runtime_parameters_info = []\n for name, field_info in self.model_fields.items(): # type: ignore\n if name not in self.runtime_parameters_names:\n continue\n\n attr = getattr(self, name)\n\n # Get runtime parameters info for `RuntimeParametersMixin` field\n if isinstance(attr, RuntimeParametersMixin):\n runtime_parameters_info.append(\n {\n \"name\": name,\n \"runtime_parameters_info\": attr.get_runtime_parameters_info(),\n }\n )\n continue\n\n # Get runtime parameters info for `List[RuntimeParametersMixin]` field\n if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n runtime_parameters_info.append(\n {\n \"name\": name,\n \"runtime_parameters_info\": {\n str(i): item.get_runtime_parameters_info()\n for i, item in enumerate(attr)\n },\n }\n )\n continue\n\n info = {\"name\": name, \"optional\": self.runtime_parameters_names[name]}\n if field_info.description is not None:\n info[\"description\"] = field_info.description\n runtime_parameters_info.append(info)\n return runtime_parameters_info\n "},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.set_runtime_parameters","title":"set_runtime_parameters(runtime_parameters) ","text":"Sets the runtime parameters of the class using the provided values. If the attr to be set is a RuntimeParametersMixin , it will call set_runtime_parameters on the attr. Parameters: Name Type Description Default runtime_parameters Dict[str, Any] A dictionary containing the values of the runtime parameters to set. required Source code in src/distilabel/mixins/runtime_parameters.py def set_runtime_parameters(self, runtime_parameters: Dict[str, Any]) -> None:\n \"\"\"Sets the runtime parameters of the class using the provided values. 
If the attr\n to be set is a `RuntimeParametersMixin`, it will call `set_runtime_parameters` on\n the attr.\n\n Args:\n runtime_parameters: A dictionary containing the values of the runtime parameters\n to set.\n \"\"\"\n runtime_parameters_names = list(self.runtime_parameters_names.keys())\n for name, value in runtime_parameters.items():\n if name not in self.runtime_parameters_names:\n # Check done just to ensure the unit tests for the mixin run\n if getattr(self, \"pipeline\", None):\n closest = difflib.get_close_matches(\n name, runtime_parameters_names, cutoff=0.5\n )\n msg = (\n f\"\u26a0\ufe0f Runtime parameter '{name}' unknown in step '{self.name}'.\" # type: ignore\n )\n if closest:\n msg += f\" Did you mean any of: {closest}\"\n else:\n msg += f\" Available runtime parameters for the step: {runtime_parameters_names}.\"\n self.pipeline._logger.warning(msg) # type: ignore\n continue\n\n attr = getattr(self, name)\n\n # Set runtime parameters for `RuntimeParametersMixin` field\n if isinstance(attr, RuntimeParametersMixin):\n attr.set_runtime_parameters(value)\n self._runtime_parameters[name] = value\n continue\n\n # Set runtime parameters for `List[RuntimeParametersMixin]` field\n if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n for i, item in enumerate(attr):\n item_value = value.get(str(i), {})\n item.set_runtime_parameters(item_value)\n self._runtime_parameters[name] = value\n continue\n\n # Handle settings values for `_SecretField`\n field_info = self.model_fields[name]\n inner_type = extract_annotation_inner_type(field_info.annotation)\n if is_type_pydantic_secret_field(inner_type):\n value = inner_type(value)\n\n # Set the value of the runtime parameter\n setattr(self, name, value)\n self._runtime_parameters[name] = value\n "},{"location":"api/models/embedding/","title":"Embedding","text":"This section contains the API reference for the distilabel embeddings. For more information on how the Embeddings works and see some examples. "},{"location":"api/models/embedding/#distilabel.models.embeddings.base","title":"base ","text":""},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings","title":"Embeddings ","text":" Bases: RuntimeParametersMixin , BaseModel , _Serializable , ABC Base class for Embeddings models. To implement an Embeddings subclass, you need to subclass this class and implement: - load method to load the Embeddings model. Don't forget to call super().load() , so the _logger attribute is initialized. - model_name property to return the model name used for the Embeddings . - encode method to generate the sentence embeddings. Attributes: Name Type Description _logger Logger the logger to be used for the Embeddings model. It will be initialized when the load method is called. Source code in src/distilabel/models/embeddings/base.py class Embeddings(RuntimeParametersMixin, BaseModel, _Serializable, ABC):\n \"\"\"Base class for `Embeddings` models.\n\n To implement an `Embeddings` subclass, you need to subclass this class and implement:\n - `load` method to load the `Embeddings` model. Don't forget to call `super().load()`,\n so the `_logger` attribute is initialized.\n - `model_name` property to return the model name used for the `Embeddings`.\n - `encode` method to generate the sentence embeddings.\n\n Attributes:\n _logger: the logger to be used for the `Embeddings` model. 
It will be initialized\n when the `load` method is called.\n \"\"\"\n\n model_config = ConfigDict(\n arbitrary_types_allowed=True,\n protected_namespaces=(),\n validate_default=True,\n validate_assignment=True,\n extra=\"forbid\",\n )\n _logger: \"Logger\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Method to be called to initialize the `Embeddings`\"\"\"\n self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n\n def unload(self) -> None:\n \"\"\"Method to be called to unload the `Embeddings` and release any resources.\"\"\"\n pass\n\n @property\n @abstractmethod\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the `Embeddings`.\"\"\"\n pass\n\n @abstractmethod\n def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n pass\n "},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.model_name","title":"model_name: str abstractmethod property ","text":"Returns the model name used for the Embeddings . "},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.load","title":"load() ","text":"Method to be called to initialize the Embeddings Source code in src/distilabel/models/embeddings/base.py def load(self) -> None:\n \"\"\"Method to be called to initialize the `Embeddings`\"\"\"\n self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n "},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.unload","title":"unload() ","text":"Method to be called to unload the Embeddings and release any resources. Source code in src/distilabel/models/embeddings/base.py def unload(self) -> None:\n \"\"\"Method to be called to unload the `Embeddings` and release any resources.\"\"\"\n pass\n "},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.encode","title":"encode(inputs) abstractmethod ","text":"Generates embeddings for the provided inputs. Parameters: Name Type Description Default inputs List[str] a list of texts for which an embedding has to be generated. required Returns: Type Description List[List[Union[int, float]]] The generated embeddings. Source code in src/distilabel/models/embeddings/base.py @abstractmethod\ndef encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n pass\n "},{"location":"api/models/embedding/embedding_gallery/","title":"Embedding Gallery","text":"This section contains the existing Embeddings subclasses implemented in distilabel . "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings","title":"embeddings ","text":""},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings","title":"SentenceTransformerEmbeddings ","text":" Bases: Embeddings , CudaDevicePlacementMixin sentence-transformers library implementation for embedding generation. Attributes: Name Type Description model str the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. device Optional[RuntimeParameter[str]] the name of the device used to load the model e.g. \"cuda\", \"mps\", etc. Defaults to None . 
prompts Optional[Dict[str, str]] a dictionary containing prompts to be used with the model. Defaults to None . default_prompt_name Optional[str] the default prompt (in prompts ) that will be applied to the inputs. If not provided, then no prompt will be used. Defaults to None . trust_remote_code bool whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . revision Optional[str] if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . token Optional[str] the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None . truncate_dim Optional[int] the dimension to truncate the sentence embeddings. Defaults to None . model_kwargs Optional[Dict[str, Any]] extra kwargs that will be passed to the Hugging Face transformers model class. Defaults to None . tokenizer_kwargs Optional[Dict[str, Any]] extra kwargs that will be passed to the Hugging Face transformers tokenizer class. Defaults to None . config_kwargs Optional[Dict[str, Any]] extra kwargs that will be passed to the Hugging Face transformers configuration class. Defaults to None . precision Optional[Literal['float32', 'int8', 'uint8', 'binary', 'ubinary']] the dtype that will have the resulting embeddings. Defaults to \"float32\" . normalize_embeddings RuntimeParameter[bool] whether to normalize the embeddings so they have a length of 1. Defaults to None . Examples: Generating sentence embeddings: from distilabel.models import SentenceTransformerEmbeddings\n\nembeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n# [-0.05447685346007347, -0.01623094454407692, ...],\n# [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n Source code in src/distilabel/models/embeddings/sentence_transformers.py class SentenceTransformerEmbeddings(Embeddings, CudaDevicePlacementMixin):\n \"\"\"`sentence-transformers` library implementation for embedding generation.\n\n Attributes:\n model: the model Hugging Face Hub repo id or a path to a directory containing the\n model weights and configuration files.\n device: the name of the device used to load the model e.g. \"cuda\", \"mps\", etc.\n Defaults to `None`.\n prompts: a dictionary containing prompts to be used with the model. Defaults to\n `None`.\n default_prompt_name: the default prompt (in `prompts`) that will be applied to the\n inputs. If not provided, then no prompt will be used. Defaults to `None`.\n trust_remote_code: whether to allow fetching and executing remote code fetched\n from the repository in the Hub. Defaults to `False`.\n revision: if `model` refers to a Hugging Face Hub repository, then the revision\n (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n local configuration will be used. Defaults to `None`.\n truncate_dim: the dimension to truncate the sentence embeddings. Defaults to `None`.\n model_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n model class. 
Defaults to `None`.\n tokenizer_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n tokenizer class. Defaults to `None`.\n config_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n configuration class. Defaults to `None`.\n precision: the dtype that will have the resulting embeddings. Defaults to `\"float32\"`.\n normalize_embeddings: whether to normalize the embeddings so they have a length\n of 1. Defaults to `None`.\n\n Examples:\n Generating sentence embeddings:\n\n ```python\n from distilabel.models import SentenceTransformerEmbeddings\n\n embeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\n embeddings.load()\n\n results = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n # [\n # [-0.05447685346007347, -0.01623094454407692, ...],\n # [4.4889533455716446e-05, 0.044016145169734955, ...],\n # ]\n ```\n \"\"\"\n\n model: str\n device: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The device to be used to load the model. If `None`, then it\"\n \" will check if a GPU can be used.\",\n )\n prompts: Optional[Dict[str, str]] = None\n default_prompt_name: Optional[str] = None\n trust_remote_code: bool = False\n revision: Optional[str] = None\n token: Optional[str] = None\n truncate_dim: Optional[int] = None\n model_kwargs: Optional[Dict[str, Any]] = None\n tokenizer_kwargs: Optional[Dict[str, Any]] = None\n config_kwargs: Optional[Dict[str, Any]] = None\n precision: Optional[Literal[\"float32\", \"int8\", \"uint8\", \"binary\", \"ubinary\"]] = (\n \"float32\"\n )\n normalize_embeddings: RuntimeParameter[bool] = Field(\n default=True,\n description=\"Whether to normalize the embeddings so the generated vectors\"\n \" have a length of 1 or not.\",\n )\n\n _model: Union[\"SentenceTransformer\", None] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the Sentence Transformer model\"\"\"\n super().load()\n\n if self.device == \"cuda\":\n CudaDevicePlacementMixin.load(self)\n\n try:\n from sentence_transformers import SentenceTransformer\n except ImportError as e:\n raise ImportError(\n \"`sentence-transformers` package is not installed. 
Please install it using\"\n \" `pip install sentence-transformers`.\"\n ) from e\n\n self._model = SentenceTransformer(\n model_name_or_path=self.model,\n device=self.device,\n prompts=self.prompts,\n default_prompt_name=self.default_prompt_name,\n trust_remote_code=self.trust_remote_code,\n revision=self.revision,\n token=self.token,\n truncate_dim=self.truncate_dim,\n model_kwargs=self.model_kwargs,\n tokenizer_kwargs=self.tokenizer_kwargs,\n config_kwargs=self.config_kwargs,\n )\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the name of the model.\"\"\"\n return self.model\n\n def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n return self._model.encode( # type: ignore\n sentences=inputs,\n batch_size=len(inputs),\n convert_to_numpy=True,\n precision=self.precision, # type: ignore\n normalize_embeddings=self.normalize_embeddings, # type: ignore\n ).tolist() # type: ignore\n\n def unload(self) -> None:\n del self._model\n if self.device == \"cuda\":\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.model_name","title":"model_name: str property ","text":"Returns the name of the model. "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.load","title":"load() ","text":"Loads the Sentence Transformer model Source code in src/distilabel/models/embeddings/sentence_transformers.py def load(self) -> None:\n \"\"\"Loads the Sentence Transformer model\"\"\"\n super().load()\n\n if self.device == \"cuda\":\n CudaDevicePlacementMixin.load(self)\n\n try:\n from sentence_transformers import SentenceTransformer\n except ImportError as e:\n raise ImportError(\n \"`sentence-transformers` package is not installed. Please install it using\"\n \" `pip install sentence-transformers`.\"\n ) from e\n\n self._model = SentenceTransformer(\n model_name_or_path=self.model,\n device=self.device,\n prompts=self.prompts,\n default_prompt_name=self.default_prompt_name,\n trust_remote_code=self.trust_remote_code,\n revision=self.revision,\n token=self.token,\n truncate_dim=self.truncate_dim,\n model_kwargs=self.model_kwargs,\n tokenizer_kwargs=self.tokenizer_kwargs,\n config_kwargs=self.config_kwargs,\n )\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.encode","title":"encode(inputs) ","text":"Generates embeddings for the provided inputs. Parameters: Name Type Description Default inputs List[str] a list of texts for which an embedding has to be generated. required Returns: Type Description List[List[Union[int, float]]] The generated embeddings. 
Source code in src/distilabel/models/embeddings/sentence_transformers.py def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n return self._model.encode( # type: ignore\n sentences=inputs,\n batch_size=len(inputs),\n convert_to_numpy=True,\n precision=self.precision, # type: ignore\n normalize_embeddings=self.normalize_embeddings, # type: ignore\n ).tolist() # type: ignore\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings","title":"vLLMEmbeddings ","text":" Bases: Embeddings , CudaDevicePlacementMixin vllm library implementation for embedding generation. Attributes: Name Type Description model str the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. dtype str the data type to use for the model. Defaults to auto . trust_remote_code bool whether to trust the remote code when loading the model. Defaults to False . quantization Optional[str] the quantization mode to use for the model. Defaults to None . revision Optional[str] the revision of the model to load. Defaults to None . enforce_eager bool whether to enforce eager execution. Defaults to True . seed int the seed to use for the random number generator. Defaults to 0 . extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]] additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {} . _model LLM the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. References - Offline inference embeddings
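Before the library's own example below, note that `extra_kwargs` is forwarded directly to vllm's `LLM` constructor, so engine-level options can be tuned from the embeddings class. A hedged sketch of what that could look like; the specific kwargs are illustrative and depend on the installed vllm version:

```python
from distilabel.models import vLLMEmbeddings

# `extra_kwargs` is passed through to `vllm.LLM`; the key below
# (`gpu_memory_utilization`) is illustrative, not required.
embeddings = vLLMEmbeddings(
    model="intfloat/e5-mistral-7b-instruct",
    dtype="auto",
    enforce_eager=True,
    extra_kwargs={"gpu_memory_utilization": 0.90},
)

embeddings.load()
vectors = embeddings.encode(inputs=["distilabel is awesome!"])
embeddings.unload()
```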
Examples: Generating sentence embeddings: from distilabel.models import vLLMEmbeddings\n\nembeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n# [-0.05447685346007347, -0.01623094454407692, ...],\n# [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n Source code in src/distilabel/models/embeddings/vllm.py class vLLMEmbeddings(Embeddings, CudaDevicePlacementMixin):\n \"\"\"`vllm` library implementation for embedding generation.\n\n Attributes:\n model: the model Hugging Face Hub repo id or a path to a directory containing the\n model weights and configuration files.\n dtype: the data type to use for the model. Defaults to `auto`.\n trust_remote_code: whether to trust the remote code when loading the model. Defaults\n to `False`.\n quantization: the quantization mode to use for the model. Defaults to `None`.\n revision: the revision of the model to load. Defaults to `None`.\n enforce_eager: whether to enforce eager execution. Defaults to `True`.\n seed: the seed to use for the random number generator. Defaults to `0`.\n extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n `LLM` class of `vllm` library. Defaults to `{}`.\n _model: the `vLLM` model instance. This attribute is meant to be used internally\n and should not be accessed directly. It will be set in the `load` method.\n\n References:\n - [Offline inference embeddings](https://docs.vllm.ai/en/latest/getting_started/examples/offline_inference_embedding.html)\n\n Examples:\n Generating sentence embeddings:\n\n ```python\n from distilabel.models import vLLMEmbeddings\n\n embeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\n embeddings.load()\n\n results = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n # [\n # [-0.05447685346007347, -0.01623094454407692, ...],\n # [4.4889533455716446e-05, 0.044016145169734955, ...],\n # ]\n ```\n \"\"\"\n\n model: str\n dtype: str = \"auto\"\n trust_remote_code: bool = False\n quantization: Optional[str] = None\n revision: Optional[str] = None\n\n enforce_eager: bool = True\n\n seed: int = 0\n\n extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n default_factory=dict,\n description=\"Additional dictionary of keyword arguments that will be passed to the\"\n \" `vLLM` class of `vllm` library. See all the supported arguments at: \"\n \"https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\",\n )\n\n _model: \"_vLLM\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\"\"\"\n super().load()\n\n CudaDevicePlacementMixin.load(self)\n\n try:\n from vllm import LLM as _vLLM\n\n except ImportError as ie:\n raise ImportError(\n \"vLLM is not installed. 
Please install it using `pip install vllm`.\"\n ) from ie\n\n self._model = _vLLM(\n self.model,\n dtype=self.dtype,\n trust_remote_code=self.trust_remote_code,\n quantization=self.quantization,\n revision=self.revision,\n enforce_eager=self.enforce_eager,\n seed=self.seed,\n **self.extra_kwargs, # type: ignore\n )\n\n def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the name of the model.\"\"\"\n return self.model\n\n def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n return [output.outputs.embedding for output in self._model.encode(inputs)]\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.model_name","title":"model_name: str property ","text":"Returns the name of the model. "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.load","title":"load() ","text":"Loads the vLLM model using either the path or the Hugging Face Hub repository id. Source code in src/distilabel/models/embeddings/vllm.py def load(self) -> None:\n \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\"\"\"\n super().load()\n\n CudaDevicePlacementMixin.load(self)\n\n try:\n from vllm import LLM as _vLLM\n\n except ImportError as ie:\n raise ImportError(\n \"vLLM is not installed. Please install it using `pip install vllm`.\"\n ) from ie\n\n self._model = _vLLM(\n self.model,\n dtype=self.dtype,\n trust_remote_code=self.trust_remote_code,\n quantization=self.quantization,\n revision=self.revision,\n enforce_eager=self.enforce_eager,\n seed=self.seed,\n **self.extra_kwargs, # type: ignore\n )\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.unload","title":"unload() ","text":"Unloads the vLLM model. Source code in src/distilabel/models/embeddings/vllm.py def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n "},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.encode","title":"encode(inputs) ","text":"Generates embeddings for the provided inputs. Parameters: Name Type Description Default inputs List[str] a list of texts for which an embedding has to be generated. required Returns: Type Description List[List[Union[int, float]]] The generated embeddings. Source code in src/distilabel/models/embeddings/vllm.py def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n \"\"\"Generates embeddings for the provided inputs.\n\n Args:\n inputs: a list of texts for which an embedding has to be generated.\n\n Returns:\n The generated embeddings.\n \"\"\"\n return [output.outputs.embedding for output in self._model.encode(inputs)]\n "},{"location":"api/models/llm/","title":"LLM","text":"This section contains the API reference for the distilabel LLMs, both for the LLM synchronous implementation, and for the AsyncLLM asynchronous one. For more information and examples on how to use existing LLMs or create custom ones, please refer to Tutorial - LLM. 
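As a quick orientation before the reference below, a minimal sketch of driving an `LLM` subclass directly. `OpenAILLM` is used here purely as an example of an existing implementation; any other model from the gallery follows the same `load()` / `generate_outputs()` contract, and the output structure shown in the comment is the `generations`/`statistics` format documented in this section.

```python
from distilabel.models import OpenAILLM

# Any `LLM` subclass follows the same contract: `load()` once, then pass
# OpenAI-style chat inputs to `generate_outputs()`.
llm = OpenAILLM(model="gpt-4o-mini", generation_kwargs={"temperature": 0.7})
llm.load()

outputs = llm.generate_outputs(
    inputs=[
        [{"role": "user", "content": "Write a one-line haiku about synthetic data."}]
    ],
    num_generations=1,
)
# Each element holds the generations for the corresponding input,
# e.g. outputs[0]["generations"][0]
```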
"},{"location":"api/models/llm/#distilabel.models.llms.base","title":"base ","text":""},{"location":"api/models/llm/#distilabel.models.llms.base.LLM","title":"LLM ","text":" Bases: RuntimeParametersMixin , BaseModel , _Serializable , ABC Base class for LLM s to be used in distilabel framework. To implement an LLM subclass, you need to subclass this class and implement: - load method to load the LLM if needed. Don't forget to call super().load() , so the _logger attribute is initialized. - model_name property to return the model name used for the LLM. - generate method to generate num_generations per input in inputs . Attributes: Name Type Description generation_kwargs Optional[RuntimeParameter[Dict[str, Any]]] the kwargs to be propagated to either generate or agenerate methods within each LLM . use_offline_batch_generation Optional[RuntimeParameter[bool]] whether to use the offline_batch_generate method to generate the responses. offline_batch_generation_block_until_done Optional[RuntimeParameter[int]] if provided, then polling will be done until the ofline_batch_generate method is able to retrieve the results. The value indicate the time to wait between each polling. jobs_ids Union[Tuple[str, ...], None] the job ids generated by the offline_batch_generate method. This attribute is used to store the job ids generated by the offline_batch_generate method so later they can be used to retrieve the results. It is not meant to be set by the user. _logger Logger the logger to be used for the LLM . It will be initialized when the load method is called. Source code in src/distilabel/models/llms/base.py class LLM(RuntimeParametersMixin, BaseModel, _Serializable, ABC):\n \"\"\"Base class for `LLM`s to be used in `distilabel` framework.\n\n To implement an `LLM` subclass, you need to subclass this class and implement:\n - `load` method to load the `LLM` if needed. Don't forget to call `super().load()`,\n so the `_logger` attribute is initialized.\n - `model_name` property to return the model name used for the LLM.\n - `generate` method to generate `num_generations` per input in `inputs`.\n\n Attributes:\n generation_kwargs: the kwargs to be propagated to either `generate` or `agenerate`\n methods within each `LLM`.\n use_offline_batch_generation: whether to use the `offline_batch_generate` method to\n generate the responses.\n offline_batch_generation_block_until_done: if provided, then polling will be done until\n the `ofline_batch_generate` method is able to retrieve the results. The value indicate\n the time to wait between each polling.\n jobs_ids: the job ids generated by the `offline_batch_generate` method. This attribute\n is used to store the job ids generated by the `offline_batch_generate` method\n so later they can be used to retrieve the results. It is not meant to be set by\n the user.\n _logger: the logger to be used for the `LLM`. 
It will be initialized when the `load`\n method is called.\n \"\"\"\n\n model_config = ConfigDict(\n arbitrary_types_allowed=True,\n protected_namespaces=(),\n validate_default=True,\n validate_assignment=True,\n extra=\"forbid\",\n )\n\n generation_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n default_factory=dict,\n description=\"The kwargs to be propagated to either `generate` or `agenerate`\"\n \" methods within each `LLM`.\",\n )\n use_offline_batch_generation: Optional[RuntimeParameter[bool]] = Field(\n default=False,\n description=\"Whether to use the `offline_batch_generate` method to generate\"\n \" the responses.\",\n )\n offline_batch_generation_block_until_done: Optional[RuntimeParameter[int]] = Field(\n default=None,\n description=\"If provided, then polling will be done until the `ofline_batch_generate`\"\n \" method is able to retrieve the results. The value indicate the time to wait between\"\n \" each polling.\",\n )\n\n jobs_ids: Union[Tuple[str, ...], None] = Field(default=None)\n _logger: \"Logger\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Method to be called to initialize the `LLM`, its logger and optionally the\n structured output generator.\"\"\"\n self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n\n def unload(self) -> None:\n \"\"\"Method to be called to unload the `LLM` and release any resources.\"\"\"\n pass\n\n @property\n @abstractmethod\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n pass\n\n def get_generation_kwargs(self) -> Dict[str, Any]:\n \"\"\"Returns the generation kwargs to be used for the generation. This method can\n be overridden to provide a more complex logic for the generation kwargs.\n\n Returns:\n The kwargs to be used for the generation.\n \"\"\"\n return self.generation_kwargs # type: ignore\n\n @abstractmethod\n def generate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Abstract method to be implemented by each LLM to generate `num_generations`\n per input in `inputs`.\n\n Args:\n inputs: the list of inputs to generate responses for which follows OpenAI's\n API format:\n\n ```python\n [\n {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n ]\n ```\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n \"\"\"\n pass\n\n def generate_outputs(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Generates outputs for the given inputs using either `generate` method or the\n `offine_batch_generate` method if `use_offline_\n \"\"\"\n if self.use_offline_batch_generation:\n if self.offline_batch_generation_block_until_done is not None:\n return self._offline_batch_generate_polling(\n inputs=inputs,\n num_generations=num_generations,\n **kwargs,\n )\n\n # This will raise `DistilabelOfflineBatchGenerationNotFinishedException` right away\n # if the batch generation is not finished.\n return self.offline_batch_generate(\n inputs=inputs,\n num_generations=num_generations,\n **kwargs,\n )\n\n return self.generate(inputs=inputs, 
num_generations=num_generations, **kwargs)\n\n def _offline_batch_generate_polling(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Method to poll the `offline_batch_generate` method until the batch generation\n is finished.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n while True:\n try:\n return self.offline_batch_generate(\n inputs=inputs,\n num_generations=num_generations,\n **kwargs,\n )\n except DistilabelOfflineBatchGenerationNotFinishedException as e:\n self._logger.info(\n f\"Waiting for the offline batch generation to finish: {e}. Sleeping\"\n f\" for {self.offline_batch_generation_block_until_done} seconds before\"\n \" trying to get the results again.\"\n )\n # When running a `Step` in a child process, SIGINT is overriden so the child\n # process doesn't stop when the parent process receives a SIGINT signal.\n # The new handler sets an environment variable that is checked here to stop\n # the polling.\n if os.getenv(SIGINT_HANDLER_CALLED_ENV_NAME) is not None:\n self._logger.info(\n \"Received a KeyboardInterrupt. Stopping polling for checking if the\"\n \" offline batch generation is finished...\"\n )\n raise e\n time.sleep(self.offline_batch_generation_block_until_done) # type: ignore\n except KeyboardInterrupt as e:\n # This is for the case the `LLM` is being executed outside a pipeline\n self._logger.info(\n \"Received a KeyboardInterrupt. Stopping polling for checking if the\"\n \" offline batch generation is finished...\"\n )\n raise DistilabelOfflineBatchGenerationNotFinishedException(\n jobs_ids=self.jobs_ids # type: ignore\n ) from e\n\n @property\n def generate_parameters(self) -> List[\"inspect.Parameter\"]:\n \"\"\"Returns the parameters of the `generate` method.\n\n Returns:\n A list containing the parameters of the `generate` method.\n \"\"\"\n return list(inspect.signature(self.generate).parameters.values())\n\n @property\n def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n \"\"\"Returns the runtime parameters of the `LLM`, which are combination of the\n attributes of the `LLM` type hinted with `RuntimeParameter` and the parameters\n of the `generate` method that are not `input` and `num_generations`.\n\n Returns:\n A dictionary with the name of the runtime parameters as keys and a boolean\n indicating if the parameter is optional or not.\n \"\"\"\n runtime_parameters = super().runtime_parameters_names\n runtime_parameters[\"generation_kwargs\"] = {}\n\n # runtime parameters from the `generate` method\n for param in self.generate_parameters:\n if param.name in [\"input\", \"inputs\", \"num_generations\"]:\n continue\n is_optional = param.default != inspect.Parameter.empty\n runtime_parameters[\"generation_kwargs\"][param.name] = is_optional\n\n return runtime_parameters\n\n def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n \"\"\"Gets the information of the runtime parameters of the `LLM` such as the name\n and the description. 
This function is meant to include the information of the runtime\n parameters in the serialized data of the `LLM`.\n\n Returns:\n A list containing the information for each runtime parameter of the `LLM`.\n \"\"\"\n runtime_parameters_info = super().get_runtime_parameters_info()\n\n generation_kwargs_info = next(\n (\n runtime_parameter_info\n for runtime_parameter_info in runtime_parameters_info\n if runtime_parameter_info[\"name\"] == \"generation_kwargs\"\n ),\n None,\n )\n\n # If `generation_kwargs` attribute is present, we need to include the `generate`\n # method arguments as the information for this attribute.\n if generation_kwargs_info:\n generate_docstring_args = self.generate_parsed_docstring[\"args\"]\n\n generation_kwargs_info[\"keys\"] = []\n for key, value in generation_kwargs_info[\"optional\"].items():\n info = {\"name\": key, \"optional\": value}\n if description := generate_docstring_args.get(key):\n info[\"description\"] = description\n generation_kwargs_info[\"keys\"].append(info)\n\n generation_kwargs_info.pop(\"optional\")\n\n return runtime_parameters_info\n\n @cached_property\n def generate_parsed_docstring(self) -> \"Docstring\":\n \"\"\"Returns the parsed docstring of the `generate` method.\n\n Returns:\n The parsed docstring of the `generate` method.\n \"\"\"\n return parse_google_docstring(self.generate)\n\n def get_last_hidden_states(\n self, inputs: List[\"StandardInput\"]\n ) -> List[\"HiddenState\"]:\n \"\"\"Method to get the last hidden states of the model for a list of inputs.\n\n Args:\n inputs: the list of inputs to get the last hidden states from.\n\n Returns:\n A list containing the last hidden state for each sequence using a NumPy array\n with shape [num_tokens, hidden_size].\n \"\"\"\n # TODO: update to use `DistilabelNotImplementedError`\n raise NotImplementedError(\n f\"Method `get_last_hidden_states` is not implemented for `{self.__class__.__name__}`\"\n )\n\n def _prepare_structured_output(\n self, structured_output: \"StructuredOutputType\"\n ) -> Union[Any, None]:\n \"\"\"Method in charge of preparing the structured output generator.\n\n By default will raise a `NotImplementedError`, subclasses that allow it must override this\n method with the implementation.\n\n Args:\n structured_output: the config to prepare the guided generation.\n\n Returns:\n The structure to be used for the guided generation.\n \"\"\"\n # TODO: update to use `DistilabelNotImplementedError`\n raise NotImplementedError(\n f\"Guided generation is not implemented for `{type(self).__name__}`\"\n )\n\n def offline_batch_generate(\n self,\n inputs: Union[List[\"FormattedInput\"], None] = None,\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Method to generate a list of outputs for the given inputs using an offline batch\n generation method to be implemented by each `LLM`.\n\n This method should create jobs the first time is called and store the job ids, so\n the second and subsequent calls can retrieve the results of the batch generation.\n If subsequent calls are made before the batch generation is finished, then the method\n should raise a `DistilabelOfflineBatchGenerationNotFinishedException`. 
This exception\n will be handled automatically by the `Pipeline` which will store all the required\n information for recovering the pipeline execution when the batch generation is finished.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n raise DistilabelNotImplementedError(\n f\"`offline_batch_generate` is not implemented for `{self.__class__.__name__}`\",\n page=\"sections/how_to_guides/advanced/offline-batch-generation/\",\n )\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.model_name","title":"model_name: str abstractmethod property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_parameters","title":"generate_parameters: List[inspect.Parameter] property ","text":"Returns the parameters of the generate method. Returns: Type Description List[Parameter] A list containing the parameters of the generate method. "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property ","text":"Returns the runtime parameters of the LLM , which are combination of the attributes of the LLM type hinted with RuntimeParameter and the parameters of the generate method that are not input and num_generations . Returns: Type Description RuntimeParametersNames A dictionary with the name of the runtime parameters as keys and a boolean RuntimeParametersNames indicating if the parameter is optional or not. "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_parsed_docstring","title":"generate_parsed_docstring: Docstring cached property ","text":"Returns the parsed docstring of the generate method. Returns: Type Description Docstring The parsed docstring of the generate method. "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.load","title":"load() ","text":"Method to be called to initialize the LLM , its logger and optionally the structured output generator. Source code in src/distilabel/models/llms/base.py def load(self) -> None:\n \"\"\"Method to be called to initialize the `LLM`, its logger and optionally the\n structured output generator.\"\"\"\n self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.unload","title":"unload() ","text":"Method to be called to unload the LLM and release any resources. Source code in src/distilabel/models/llms/base.py def unload(self) -> None:\n \"\"\"Method to be called to unload the `LLM` and release any resources.\"\"\"\n pass\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_generation_kwargs","title":"get_generation_kwargs() ","text":"Returns the generation kwargs to be used for the generation. This method can be overridden to provide a more complex logic for the generation kwargs. Returns: Type Description Dict[str, Any] The kwargs to be used for the generation. Source code in src/distilabel/models/llms/base.py def get_generation_kwargs(self) -> Dict[str, Any]:\n \"\"\"Returns the generation kwargs to be used for the generation. 
This method can\n be overridden to provide a more complex logic for the generation kwargs.\n\n Returns:\n The kwargs to be used for the generation.\n \"\"\"\n return self.generation_kwargs # type: ignore\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate","title":"generate(inputs, num_generations=1, **kwargs) abstractmethod ","text":"Abstract method to be implemented by each LLM to generate num_generations per input in inputs . Parameters: Name Type Description Default inputs List[FormattedInput] the list of inputs to generate responses for which follows OpenAI's API format: [\n {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n]\n required num_generations int the number of generations to generate per input. 1 **kwargs Any the additional kwargs to be used for the generation. {} Source code in src/distilabel/models/llms/base.py @abstractmethod\ndef generate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Abstract method to be implemented by each LLM to generate `num_generations`\n per input in `inputs`.\n\n Args:\n inputs: the list of inputs to generate responses for which follows OpenAI's\n API format:\n\n ```python\n [\n {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n ]\n ```\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n \"\"\"\n pass\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_outputs","title":"generate_outputs(inputs, num_generations=1, **kwargs) ","text":"Generates outputs for the given inputs using either generate method or the offine_batch_generate method if `use_offline_ Source code in src/distilabel/models/llms/base.py def generate_outputs(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Generates outputs for the given inputs using either `generate` method or the\n `offine_batch_generate` method if `use_offline_\n \"\"\"\n if self.use_offline_batch_generation:\n if self.offline_batch_generation_block_until_done is not None:\n return self._offline_batch_generate_polling(\n inputs=inputs,\n num_generations=num_generations,\n **kwargs,\n )\n\n # This will raise `DistilabelOfflineBatchGenerationNotFinishedException` right away\n # if the batch generation is not finished.\n return self.offline_batch_generate(\n inputs=inputs,\n num_generations=num_generations,\n **kwargs,\n )\n\n return self.generate(inputs=inputs, num_generations=num_generations, **kwargs)\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_runtime_parameters_info","title":"get_runtime_parameters_info() ","text":"Gets the information of the runtime parameters of the LLM such as the name and the description. This function is meant to include the information of the runtime parameters in the serialized data of the LLM . 
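To tie the pieces above together, a minimal, purely illustrative sketch of a custom synchronous `LLM` implementing the members the base class asks for (`model_name` and `generate`; `load` is inherited). The echo behaviour and class name are hypothetical; the returned dicts follow the `generations`/`statistics` structure described in this reference.

```python
from typing import Any

from distilabel.models.llms.base import LLM


# Illustrative dummy `LLM`: it "generates" by echoing the last user turn.
class EchoLLM(LLM):
    @property
    def model_name(self) -> str:
        return "echo-llm"

    def generate(self, inputs, num_generations: int = 1, **kwargs: Any):
        outputs = []
        for conversation in inputs:
            last_user = next(
                m["content"] for m in reversed(conversation) if m["role"] == "user"
            )
            outputs.append(
                {
                    "generations": [last_user] * num_generations,
                    "statistics": {
                        "input_tokens": [0] * num_generations,
                        "output_tokens": [0] * num_generations,
                    },
                }
            )
        return outputs


llm = EchoLLM()
llm.load()
print(llm.generate_outputs(inputs=[[{"role": "user", "content": "hi"}]]))
```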
Returns: Type Description List[RuntimeParameterInfo] A list containing the information for each runtime parameter of the LLM . Source code in src/distilabel/models/llms/base.py def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n \"\"\"Gets the information of the runtime parameters of the `LLM` such as the name\n and the description. This function is meant to include the information of the runtime\n parameters in the serialized data of the `LLM`.\n\n Returns:\n A list containing the information for each runtime parameter of the `LLM`.\n \"\"\"\n runtime_parameters_info = super().get_runtime_parameters_info()\n\n generation_kwargs_info = next(\n (\n runtime_parameter_info\n for runtime_parameter_info in runtime_parameters_info\n if runtime_parameter_info[\"name\"] == \"generation_kwargs\"\n ),\n None,\n )\n\n # If `generation_kwargs` attribute is present, we need to include the `generate`\n # method arguments as the information for this attribute.\n if generation_kwargs_info:\n generate_docstring_args = self.generate_parsed_docstring[\"args\"]\n\n generation_kwargs_info[\"keys\"] = []\n for key, value in generation_kwargs_info[\"optional\"].items():\n info = {\"name\": key, \"optional\": value}\n if description := generate_docstring_args.get(key):\n info[\"description\"] = description\n generation_kwargs_info[\"keys\"].append(info)\n\n generation_kwargs_info.pop(\"optional\")\n\n return runtime_parameters_info\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_last_hidden_states","title":"get_last_hidden_states(inputs) ","text":"Method to get the last hidden states of the model for a list of inputs. Parameters: Name Type Description Default inputs List[StandardInput] the list of inputs to get the last hidden states from. required Returns: Type Description List[HiddenState] A list containing the last hidden state for each sequence using a NumPy array with shape [num_tokens, hidden_size]. Source code in src/distilabel/models/llms/base.py def get_last_hidden_states(\n self, inputs: List[\"StandardInput\"]\n) -> List[\"HiddenState\"]:\n \"\"\"Method to get the last hidden states of the model for a list of inputs.\n\n Args:\n inputs: the list of inputs to get the last hidden states from.\n\n Returns:\n A list containing the last hidden state for each sequence using a NumPy array\n with shape [num_tokens, hidden_size].\n \"\"\"\n # TODO: update to use `DistilabelNotImplementedError`\n raise NotImplementedError(\n f\"Method `get_last_hidden_states` is not implemented for `{self.__class__.__name__}`\"\n )\n "},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.offline_batch_generate","title":"offline_batch_generate(inputs=None, num_generations=1, **kwargs) ","text":"Method to generate a list of outputs for the given inputs using an offline batch generation method to be implemented by each LLM . This method should create jobs the first time is called and store the job ids, so the second and subsequent calls can retrieve the results of the batch generation. If subsequent calls are made before the batch generation is finished, then the method should raise a DistilabelOfflineBatchGenerationNotFinishedException . This exception will be handled automatically by the Pipeline which will store all the required information for recovering the pipeline execution when the batch generation is finished. Parameters: Name Type Description Default inputs Union[List[FormattedInput], None] the list of inputs to generate responses for. 
None num_generations int the number of generations to generate per input. 1 **kwargs Any the additional kwargs to be used for the generation. {} Returns: Type Description List[GenerateOutput] A list containing the generations for each input. Source code in src/distilabel/models/llms/base.py def offline_batch_generate(\n self,\n inputs: Union[List[\"FormattedInput\"], None] = None,\n num_generations: int = 1,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Method to generate a list of outputs for the given inputs using an offline batch\n generation method to be implemented by each `LLM`.\n\n This method should create jobs the first time is called and store the job ids, so\n the second and subsequent calls can retrieve the results of the batch generation.\n If subsequent calls are made before the batch generation is finished, then the method\n should raise a `DistilabelOfflineBatchGenerationNotFinishedException`. This exception\n will be handled automatically by the `Pipeline` which will store all the required\n information for recovering the pipeline execution when the batch generation is finished.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n raise DistilabelNotImplementedError(\n f\"`offline_batch_generate` is not implemented for `{self.__class__.__name__}`\",\n page=\"sections/how_to_guides/advanced/offline-batch-generation/\",\n )\n "},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM","title":"AsyncLLM ","text":" Bases: LLM Abstract class for asynchronous LLMs, so as to benefit from the async capabilities of each LLM implementation. This class is meant to be subclassed by each LLM, and the method agenerate needs to be implemented to provide the asynchronous generation of responses. Attributes: Name Type Description _event_loop AbstractEventLoop the event loop to be used for the asynchronous generation of responses. Source code in src/distilabel/models/llms/base.py class AsyncLLM(LLM):\n \"\"\"Abstract class for asynchronous LLMs, so as to benefit from the async capabilities\n of each LLM implementation. 
This class is meant to be subclassed by each LLM, and the\n method `agenerate` needs to be implemented to provide the asynchronous generation of\n responses.\n\n Attributes:\n _event_loop: the event loop to be used for the asynchronous generation of responses.\n \"\"\"\n\n _num_generations_param_supported = True\n _event_loop: \"asyncio.AbstractEventLoop\" = PrivateAttr(default=None)\n _new_event_loop: bool = PrivateAttr(default=False)\n\n @property\n def generate_parameters(self) -> List[inspect.Parameter]:\n \"\"\"Returns the parameters of the `agenerate` method.\n\n Returns:\n A list containing the parameters of the `agenerate` method.\n \"\"\"\n return list(inspect.signature(self.agenerate).parameters.values())\n\n @cached_property\n def generate_parsed_docstring(self) -> \"Docstring\":\n \"\"\"Returns the parsed docstring of the `agenerate` method.\n\n Returns:\n The parsed docstring of the `agenerate` method.\n \"\"\"\n return parse_google_docstring(self.agenerate)\n\n @property\n def event_loop(self) -> \"asyncio.AbstractEventLoop\":\n if self._event_loop is None:\n try:\n self._event_loop = asyncio.get_running_loop()\n if self._event_loop.is_closed():\n self._event_loop = asyncio.new_event_loop() # type: ignore\n self._new_event_loop = True\n except RuntimeError:\n self._event_loop = asyncio.new_event_loop()\n self._new_event_loop = True\n asyncio.set_event_loop(self._event_loop)\n return self._event_loop\n\n @abstractmethod\n async def agenerate(\n self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n ) -> \"GenerateOutput\":\n \"\"\"Method to generate a `num_generations` responses for a given input asynchronously,\n and executed concurrently in `generate` method.\n \"\"\"\n pass\n\n async def _agenerate(\n self, inputs: List[\"FormattedInput\"], num_generations: int = 1, **kwargs: Any\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n if self._num_generations_param_supported:\n tasks = [\n asyncio.create_task(\n self.agenerate(\n input=input, num_generations=num_generations, **kwargs\n )\n )\n for input in inputs\n ]\n result = await asyncio.gather(*tasks)\n return result\n\n tasks = [\n asyncio.create_task(self.agenerate(input=input, **kwargs))\n for input in inputs\n for _ in range(num_generations)\n ]\n outputs = await asyncio.gather(*tasks)\n return merge_responses(outputs, n=num_generations)\n\n def generate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Method to generate a list of responses asynchronously, returning the output\n synchronously awaiting for the response of each input sent to `agenerate`.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n return self.event_loop.run_until_complete(\n self._agenerate(inputs=inputs, num_generations=num_generations, **kwargs)\n )\n\n def __del__(self) -> None:\n \"\"\"Closes the event loop when the object is deleted.\"\"\"\n if sys.meta_path is None:\n return\n\n if 
self._new_event_loop:\n if self._event_loop.is_running():\n self._event_loop.stop()\n self._event_loop.close()\n\n @staticmethod\n def _prepare_structured_output( # type: ignore\n structured_output: \"InstructorStructuredOutputType\",\n client: Any = None,\n framework: Optional[str] = None,\n ) -> Dict[str, Union[str, Any]]:\n \"\"\"Wraps the client and updates the schema to work store it internally as a json schema.\n\n Args:\n structured_output: The configuration dict to prepare the structured output.\n client: The client to wrap to generate structured output. Implemented to work\n with `instructor`.\n framework: The name of the framework.\n\n Returns:\n A dictionary containing the wrapped client and the schema to update the structured_output\n variable in case it is a pydantic model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.instructor import (\n prepare_instructor,\n )\n\n result = {}\n client = prepare_instructor(\n client,\n mode=structured_output.get(\"mode\"),\n framework=framework, # type: ignore\n )\n result[\"client\"] = client\n\n schema = structured_output.get(\"schema\")\n if not schema:\n raise DistilabelUserError(\n f\"The `structured_output` argument must contain a schema: {structured_output}\",\n page=\"sections/how_to_guides/advanced/structured_generation/#instructor\",\n )\n if inspect.isclass(schema) and issubclass(schema, BaseModel):\n # We want a json schema for the serialization, but instructor wants a pydantic BaseModel.\n structured_output[\"schema\"] = schema.model_json_schema() # type: ignore\n result[\"structured_output\"] = structured_output\n\n return result\n\n @staticmethod\n def _prepare_kwargs(\n arguments: Dict[str, Any], structured_output: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"Helper method to update the kwargs with the structured output configuration,\n used in case they are defined.\n\n Args:\n arguments: The arguments that would be passed to the LLM as **kwargs.\n to update with the structured output configuration.\n structured_outputs: The structured output configuration to update the arguments.\n\n Returns:\n kwargs updated with the special arguments used by `instructor`.\n \"\"\"\n # We can deal with json schema or BaseModel, but we need to convert it to a BaseModel\n # for the Instructor client.\n schema = structured_output.get(\"schema\", {})\n\n # If there's already a pydantic model, we don't need to do anything,\n # otherwise, try to obtain one.\n if not (inspect.isclass(schema) and issubclass(schema, BaseModel)):\n from distilabel.steps.tasks.structured_outputs.utils import (\n json_schema_to_model,\n )\n\n if isinstance(schema, str):\n # In case it was saved in the dataset as a string.\n schema = json.loads(schema)\n\n try:\n schema = json_schema_to_model(schema)\n except Exception as e:\n raise ValueError(\n f\"Failed to convert the schema to a pydantic model, the model is too complex currently: {e}\"\n ) from e\n\n arguments.update(\n **{\n \"response_model\": schema,\n \"max_retries\": structured_output.get(\"max_retries\", 1),\n },\n )\n return arguments\n "},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate_parameters","title":"generate_parameters: List[inspect.Parameter] property ","text":"Returns the parameters of the agenerate method. Returns: Type Description List[Parameter] A list containing the parameters of the agenerate method. 
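For asynchronous backends the pattern is the same, but only `agenerate` has to be written: the inherited `generate` schedules one task per input and gathers them on the event loop. A minimal, purely illustrative sketch; the class name and its sleep-and-echo behaviour are hypothetical stand-ins for a real API call.

```python
import asyncio
from typing import Any

from distilabel.models.llms.base import AsyncLLM


# Hypothetical async backend: `agenerate` handles a single input, while the
# inherited `generate` fans the inputs out concurrently via asyncio.
class SleepyEchoLLM(AsyncLLM):
    @property
    def model_name(self) -> str:
        return "sleepy-echo-llm"

    async def agenerate(self, input, num_generations: int = 1, **kwargs: Any):
        await asyncio.sleep(0.01)  # stand-in for a real network call
        text = input[-1]["content"]
        return {
            "generations": [text] * num_generations,
            "statistics": {
                "input_tokens": [0] * num_generations,
                "output_tokens": [0] * num_generations,
            },
        }


llm = SleepyEchoLLM()
llm.load()
print(llm.generate(inputs=[[{"role": "user", "content": "hello"}]], num_generations=2))
```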
"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate_parsed_docstring","title":"generate_parsed_docstring: Docstring cached property ","text":"Returns the parsed docstring of the agenerate method. Returns: Type Description Docstring The parsed docstring of the agenerate method. "},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.agenerate","title":"agenerate(input, num_generations=1, **kwargs) abstractmethod async ","text":"Method to generate a num_generations responses for a given input asynchronously, and executed concurrently in generate method. Source code in src/distilabel/models/llms/base.py @abstractmethod\nasync def agenerate(\n self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n) -> \"GenerateOutput\":\n \"\"\"Method to generate a `num_generations` responses for a given input asynchronously,\n and executed concurrently in `generate` method.\n \"\"\"\n pass\n "},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate","title":"generate(inputs, num_generations=1, **kwargs) ","text":"Method to generate a list of responses asynchronously, returning the output synchronously awaiting for the response of each input sent to agenerate . Parameters: Name Type Description Default inputs List[FormattedInput] the list of inputs to generate responses for. required num_generations int the number of generations to generate per input. 1 **kwargs Any the additional kwargs to be used for the generation. {} Returns: Type Description List[GenerateOutput] A list containing the generations for each input. Source code in src/distilabel/models/llms/base.py def generate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Method to generate a list of responses asynchronously, returning the output\n synchronously awaiting for the response of each input sent to `agenerate`.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n return self.event_loop.run_until_complete(\n self._agenerate(inputs=inputs, num_generations=num_generations, **kwargs)\n )\n "},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.__del__","title":"__del__() ","text":"Closes the event loop when the object is deleted. Source code in src/distilabel/models/llms/base.py def __del__(self) -> None:\n \"\"\"Closes the event loop when the object is deleted.\"\"\"\n if sys.meta_path is None:\n return\n\n if self._new_event_loop:\n if self._event_loop.is_running():\n self._event_loop.stop()\n self._event_loop.close()\n "},{"location":"api/models/llm/#distilabel.models.llms.base.merge_responses","title":"merge_responses(responses, n=1) ","text":"Helper function to group the responses from LLM.agenerate method according to the number of generations requested. Parameters: Name Type Description Default responses List[GenerateOutput] the responses from the LLM.agenerate method. required n int number of responses to group together. Defaults to 1. 1 Returns: Type Description List[GenerateOutput] List of merged responses, where each merged response contains n generations List[GenerateOutput] and their corresponding statistics. 
Source code in src/distilabel/models/llms/base.py def merge_responses(\n responses: List[\"GenerateOutput\"], n: int = 1\n) -> List[\"GenerateOutput\"]:\n \"\"\"Helper function to group the responses from `LLM.agenerate` method according\n to the number of generations requested.\n\n Args:\n responses: the responses from the `LLM.agenerate` method.\n n: number of responses to group together. Defaults to 1.\n\n Returns:\n List of merged responses, where each merged response contains n generations\n and their corresponding statistics.\n \"\"\"\n if not responses:\n return []\n\n def chunks(lst, n):\n \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n for i in range(0, len(lst), n):\n yield list(islice(lst, i, i + n))\n\n extra_keys = [\n key for key in responses[0].keys() if key not in (\"generations\", \"statistics\")\n ]\n\n result = []\n for group in chunks(responses, n):\n merged = {\n \"generations\": [],\n \"statistics\": {\"input_tokens\": [], \"output_tokens\": []},\n }\n for response in group:\n merged[\"generations\"].append(response[\"generations\"][0])\n # Merge statistics\n for key in response[\"statistics\"]:\n if key not in merged[\"statistics\"]:\n merged[\"statistics\"][key] = []\n merged[\"statistics\"][key].append(response[\"statistics\"][key][0])\n # Merge extra keys returned by the `LLM`\n for extra_key in extra_keys:\n if extra_key not in merged:\n merged[extra_key] = []\n merged[extra_key].append(response[extra_key][0])\n result.append(merged)\n return result\n "},{"location":"api/models/llm/llm_gallery/","title":"LLM Gallery","text":"This section contains the existing LLM subclasses implemented in distilabel . "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms","title":"llms ","text":""},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM","title":"AnthropicLLM ","text":" Bases: AsyncLLM Anthropic LLM implementation running the Async API client. Attributes: Name Type Description model str the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\", \"claude-3-sonnet-20240229\", etc. Available models can be checked here: Anthropic: Models overview. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable. base_url Optional[RuntimeParameter[str]] the base URL to use for the Anthropic API. Defaults to None which means that https://api.anthropic.com will be used internally. timeout RuntimeParameter[float] the maximum time in seconds to wait for a response. Defaults to 600.0 . max_retries RuntimeParameter[int] The maximum number of times to retry the request before failing. Defaults to 6 . http_client Optional[AsyncClient] if provided, an alternative HTTP client to use for calling Anthropic API. Defaults to None . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . _api_key_env_var str the name of the environment variable to use for the API key. It is meant to be used internally. _aclient Optional[AsyncAnthropic] the AsyncAnthropic client to use for the Anthropic API. It is meant to be used internally. Set in the load method. Runtime parameters api_key : the API key to authenticate the requests to the Anthropic API. 
If not provided, it will be read from ANTHROPIC_API_KEY environment variable. base_url : the base URL to use for the Anthropic API. Defaults to \"https://api.anthropic.com\" . timeout : the maximum time in seconds to wait for a response. Defaults to 600.0 . max_retries : the maximum number of times to retry the request before failing. Defaults to 6 . Examples: Generate text: from distilabel.models.llms import AnthropicLLM\n\nllm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pydantic import BaseModel\nfrom distilabel.models.llms import AnthropicLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = AnthropicLLM(\n model=\"claude-3-opus-20240229\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/anthropic.py class AnthropicLLM(AsyncLLM):\n \"\"\"Anthropic LLM implementation running the Async API client.\n\n Attributes:\n model: the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\",\n \"claude-3-sonnet-20240229\", etc. Available models can be checked here:\n [Anthropic: Models overview](https://docs.anthropic.com/claude/docs/models-overview).\n api_key: the API key to authenticate the requests to the Anthropic API. If not provided,\n it will be read from `ANTHROPIC_API_KEY` environment variable.\n base_url: the base URL to use for the Anthropic API. Defaults to `None` which means\n that `https://api.anthropic.com` will be used internally.\n timeout: the maximum time in seconds to wait for a response. Defaults to `600.0`.\n max_retries: The maximum number of times to retry the request before failing. Defaults\n to `6`.\n http_client: if provided, an alternative HTTP client to use for calling Anthropic\n API. Defaults to `None`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n _api_key_env_var: the name of the environment variable to use for the API key. It\n is meant to be used internally.\n _aclient: the `AsyncAnthropic` client to use for the Anthropic API. It is meant\n to be used internally. Set in the `load` method.\n\n Runtime parameters:\n - `api_key`: the API key to authenticate the requests to the Anthropic API. If not\n provided, it will be read from `ANTHROPIC_API_KEY` environment variable.\n - `base_url`: the base URL to use for the Anthropic API. Defaults to `\"https://api.anthropic.com\"`.\n - `timeout`: the maximum time in seconds to wait for a response. 
Defaults to `600.0`.\n - `max_retries`: the maximum number of times to retry the request before failing.\n Defaults to `6`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import AnthropicLLM\n\n llm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import AnthropicLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = AnthropicLLM(\n model=\"claude-3-opus-20240229\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n \"ANTHROPIC_BASE_URL\", \"https://api.anthropic.com\"\n ),\n description=\"The base URL to use for the Anthropic API.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_ANTHROPIC_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Anthropic API.\",\n )\n timeout: RuntimeParameter[float] = Field(\n default=600.0,\n description=\"The maximum time in seconds to wait for a response from the API.\",\n )\n max_retries: RuntimeParameter[int] = Field(\n default=6,\n description=\"The maximum number of times to retry the request to the API before\"\n \" failing.\",\n )\n http_client: Optional[AsyncClient] = Field(default=None, exclude=True)\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _num_generations_param_supported = False\n\n _api_key_env_var: str = PrivateAttr(default=_ANTHROPIC_API_KEY_ENV_VAR_NAME)\n _aclient: Optional[\"AsyncAnthropic\"] = PrivateAttr(...)\n\n def _check_model_exists(self) -> None:\n \"\"\"Checks if the specified model exists in the available models.\"\"\"\n from anthropic import AsyncAnthropic\n\n annotation = get_type_hints(AsyncAnthropic().messages.create).get(\"model\", None)\n models = [\n value\n for type_ in get_args(annotation)\n if get_origin(type_) is Literal\n for value in get_args(type_)\n ]\n\n if self.model not in models:\n raise ValueError(\n f\"Model {self.model} does not exist among available models. \"\n f\"The available models are {', '.join(models)}\"\n )\n\n def load(self) -> None:\n \"\"\"Loads the `AsyncAnthropic` client to use the Anthropic async API.\"\"\"\n super().load()\n\n try:\n from anthropic import AsyncAnthropic\n except ImportError as ie:\n raise ImportError(\n \"Anthropic Python client is not installed. 
Please install it using\"\n \" `pip install anthropic`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._check_model_exists()\n\n self._aclient = AsyncAnthropic(\n api_key=self.api_key.get_secret_value(),\n base_url=self.base_url,\n timeout=self.timeout,\n http_client=self.http_client,\n max_retries=self.max_retries,\n )\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"anthropic\",\n )\n self._aclient = result.get(\"client\")\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_tokens: int = 128,\n stop_sequences: Union[List[str], None] = None,\n temperature: float = 1.0,\n top_p: Union[float, None] = None,\n top_k: Union[int, None] = None,\n ) -> GenerateOutput:\n \"\"\"Generates a response asynchronously, using the [Anthropic Async API definition](https://github.com/anthropics/anthropic-sdk-python).\n\n Args:\n input: a single input in chat format to generate responses for.\n max_tokens: the maximum number of new tokens that the model will generate. Defaults to `128`.\n stop_sequences: custom text sequences that will cause the model to stop generating. Defaults to `NOT_GIVEN`.\n temperature: the temperature to use for the generation. Set only if top_p is None. Defaults to `1.0`.\n top_p: the top-p value to use for the generation. Defaults to `NOT_GIVEN`.\n top_k: the top-k value to use for the generation. 
Defaults to `NOT_GIVEN`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from anthropic._types import NOT_GIVEN\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"anthropic\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"system\": (\n input.pop(0)[\"content\"]\n if input and input[0][\"role\"] == \"system\"\n else NOT_GIVEN\n ),\n \"max_tokens\": max_tokens,\n \"stream\": False,\n \"stop_sequences\": NOT_GIVEN if stop_sequences is None else stop_sequences,\n \"temperature\": temperature,\n \"top_p\": NOT_GIVEN if top_p is None else top_p,\n \"top_k\": NOT_GIVEN if top_k is None else top_k,\n }\n\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n completion: Union[\"Message\", \"BaseModel\"] = await self._aclient.messages.create(\n **kwargs\n ) # type: ignore\n if structured_output:\n # raw_response = completion._raw_response\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n if (content := completion.content[0].text) is None:\n self._logger.warning(\n f\"Received no response using Anthropic client (model: '{self.model}').\"\n f\" Finish reason was: {completion.stop_reason}\"\n )\n return prepare_output([content], **self._get_llm_statistics(completion))\n\n @staticmethod\n def _get_llm_statistics(completion: \"Message\") -> \"LLMStatistics\":\n return {\n \"input_tokens\": [completion.usage.input_tokens],\n \"output_tokens\": [completion.usage.output_tokens],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM._check_model_exists","title":"_check_model_exists() ","text":"Checks if the specified model exists in the available models. Source code in src/distilabel/models/llms/anthropic.py def _check_model_exists(self) -> None:\n \"\"\"Checks if the specified model exists in the available models.\"\"\"\n from anthropic import AsyncAnthropic\n\n annotation = get_type_hints(AsyncAnthropic().messages.create).get(\"model\", None)\n models = [\n value\n for type_ in get_args(annotation)\n if get_origin(type_) is Literal\n for value in get_args(type_)\n ]\n\n if self.model not in models:\n raise ValueError(\n f\"Model {self.model} does not exist among available models. \"\n f\"The available models are {', '.join(models)}\"\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.load","title":"load() ","text":"Loads the AsyncAnthropic client to use the Anthropic async API. Source code in src/distilabel/models/llms/anthropic.py def load(self) -> None:\n \"\"\"Loads the `AsyncAnthropic` client to use the Anthropic async API.\"\"\"\n super().load()\n\n try:\n from anthropic import AsyncAnthropic\n except ImportError as ie:\n raise ImportError(\n \"Anthropic Python client is not installed. 
Please install it using\"\n \" `pip install anthropic`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._check_model_exists()\n\n self._aclient = AsyncAnthropic(\n api_key=self.api_key.get_secret_value(),\n base_url=self.base_url,\n timeout=self.timeout,\n http_client=self.http_client,\n max_retries=self.max_retries,\n )\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"anthropic\",\n )\n self._aclient = result.get(\"client\")\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.agenerate","title":"agenerate(input, max_tokens=128, stop_sequences=None, temperature=1.0, top_p=None, top_k=None) async ","text":"Generates a response asynchronously, using the Anthropic Async API definition. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required max_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 stop_sequences Union[List[str], None] custom text sequences that will cause the model to stop generating. Defaults to NOT_GIVEN . None temperature float the temperature to use for the generation. Set only if top_p is None. Defaults to 1.0 . 1.0 top_p Union[float, None] the top-p value to use for the generation. Defaults to NOT_GIVEN . None top_k Union[int, None] the top-k value to use for the generation. Defaults to NOT_GIVEN . None Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/anthropic.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_tokens: int = 128,\n stop_sequences: Union[List[str], None] = None,\n temperature: float = 1.0,\n top_p: Union[float, None] = None,\n top_k: Union[int, None] = None,\n) -> GenerateOutput:\n \"\"\"Generates a response asynchronously, using the [Anthropic Async API definition](https://github.com/anthropics/anthropic-sdk-python).\n\n Args:\n input: a single input in chat format to generate responses for.\n max_tokens: the maximum number of new tokens that the model will generate. Defaults to `128`.\n stop_sequences: custom text sequences that will cause the model to stop generating. Defaults to `NOT_GIVEN`.\n temperature: the temperature to use for the generation. Set only if top_p is None. Defaults to `1.0`.\n top_p: the top-p value to use for the generation. Defaults to `NOT_GIVEN`.\n top_k: the top-k value to use for the generation. 
Defaults to `NOT_GIVEN`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from anthropic._types import NOT_GIVEN\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"anthropic\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"system\": (\n input.pop(0)[\"content\"]\n if input and input[0][\"role\"] == \"system\"\n else NOT_GIVEN\n ),\n \"max_tokens\": max_tokens,\n \"stream\": False,\n \"stop_sequences\": NOT_GIVEN if stop_sequences is None else stop_sequences,\n \"temperature\": temperature,\n \"top_p\": NOT_GIVEN if top_p is None else top_p,\n \"top_k\": NOT_GIVEN if top_k is None else top_k,\n }\n\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n completion: Union[\"Message\", \"BaseModel\"] = await self._aclient.messages.create(\n **kwargs\n ) # type: ignore\n if structured_output:\n # raw_response = completion._raw_response\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n if (content := completion.content[0].text) is None:\n self._logger.warning(\n f\"Received no response using Anthropic client (model: '{self.model}').\"\n f\" Finish reason was: {completion.stop_reason}\"\n )\n return prepare_output([content], **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnyscaleLLM","title":"AnyscaleLLM ","text":" Bases: OpenAILLM Anyscale LLM implementation running the async API client of OpenAI. Attributes: Name Type Description model the model name to use for the LLM, e.g., google/gemma-7b-it . See the supported models under the \"Text Generation -> Supported Models\" section here. base_url Optional[RuntimeParameter[str]] the base URL to use for the Anyscale API requests. Defaults to None , which means that the value set for the environment variable ANYSCALE_BASE_URL will be used, or \"https://api.endpoints.anyscale.com/v1\" if not set. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Anyscale API. Defaults to None which means that the value set for the environment variable ANYSCALE_API_KEY will be used, or None if not set. _api_key_env_var str the name of the environment variable to use for the API key. It is meant to be used internally. Examples: Generate text: from distilabel.models.llms import AnyscaleLLM\n\nllm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Source code in src/distilabel/models/llms/anyscale.py class AnyscaleLLM(OpenAILLM):\n \"\"\"Anyscale LLM implementation running the async API client of OpenAI.\n\n Attributes:\n model: the model name to use for the LLM, e.g., `google/gemma-7b-it`. See the\n supported models under the \"Text Generation -> Supported Models\" section\n [here](https://docs.endpoints.anyscale.com/).\n base_url: the base URL to use for the Anyscale API requests. 
Defaults to `None`, which\n means that the value set for the environment variable `ANYSCALE_BASE_URL` will be used, or\n \"https://api.endpoints.anyscale.com/v1\" if not set.\n api_key: the API key to authenticate the requests to the Anyscale API. Defaults to `None` which\n means that the value set for the environment variable `ANYSCALE_API_KEY` will be used, or\n `None` if not set.\n _api_key_env_var: the name of the environment variable to use for the API key.\n It is meant to be used internally.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import AnyscaleLLM\n\n llm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n \"\"\"\n\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n \"ANYSCALE_BASE_URL\", \"https://api.endpoints.anyscale.com/v1\"\n ),\n description=\"The base URL to use for the Anyscale API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_ANYSCALE_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Anyscale API.\",\n )\n\n _api_key_env_var: str = PrivateAttr(_ANYSCALE_API_KEY_ENV_VAR_NAME)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AzureOpenAILLM","title":"AzureOpenAILLM ","text":" Bases: OpenAILLM Azure OpenAI LLM implementation running the async API client. Attributes: Name Type Description model the model name to use for the LLM i.e. the name of the Azure deployment. base_url Optional[RuntimeParameter[str]] the base URL to use for the Azure OpenAI API can be set with AZURE_OPENAI_ENDPOINT . Defaults to None which means that the value set for the environment variable AZURE_OPENAI_ENDPOINT will be used, or None if not set. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Azure OpenAI API. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_API_KEY will be used, or None if not set. api_version Optional[RuntimeParameter[str]] the API version to use for the Azure OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_VERSION will be used, or None if not set. 
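Since all three `AzureOpenAILLM` settings fall back to environment variables, a configuration sketch may be useful alongside the examples below. The endpoint, key, API version and deployment name are placeholders, not working credentials.

```python
# Environment-variable configuration sketch; all values are placeholders.
import os

os.environ["AZURE_OPENAI_ENDPOINT"] = "https://my-resource.openai.azure.com"
os.environ["AZURE_OPENAI_API_KEY"] = "api.key"
os.environ["OPENAI_API_VERSION"] = "2024-02-01"  # placeholder API version

from distilabel.models.llms import AzureOpenAILLM

# `model` is the name of the Azure deployment, as noted in the attributes above.
llm = AzureOpenAILLM(model="my-gpt-4-deployment")
llm.load()
```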
Icon :material-microsoft-azure: Examples: Generate text: from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate text from a custom endpoint following the OpenAI API: from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pydantic import BaseModel\nfrom distilabel.models.llms import AzureOpenAILLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = AzureOpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/azure.py class AzureOpenAILLM(OpenAILLM):\n \"\"\"Azure OpenAI LLM implementation running the async API client.\n\n Attributes:\n model: the model name to use for the LLM i.e. the name of the Azure deployment.\n base_url: the base URL to use for the Azure OpenAI API can be set with `AZURE_OPENAI_ENDPOINT`.\n Defaults to `None` which means that the value set for the environment variable\n `AZURE_OPENAI_ENDPOINT` will be used, or `None` if not set.\n api_key: the API key to authenticate the requests to the Azure OpenAI API. Defaults to `None`\n which means that the value set for the environment variable `AZURE_OPENAI_API_KEY` will be\n used, or `None` if not set.\n api_version: the API version to use for the Azure OpenAI API. Defaults to `None` which means\n that the value set for the environment variable `OPENAI_API_VERSION` will be used, or\n `None` if not set.\n\n Icon:\n `:material-microsoft-azure:`\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import AzureOpenAILLM\n\n llm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate text from a custom endpoint following the OpenAI API:\n\n ```python\n from distilabel.models.llms import AzureOpenAILLM\n\n llm = AzureOpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import AzureOpenAILLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = AzureOpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(_AZURE_OPENAI_ENDPOINT_ENV_VAR_NAME),\n description=\"The base URL to use for the Azure OpenAI API requests i.e. 
the Azure OpenAI endpoint.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_AZURE_OPENAI_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Azure OpenAI API.\",\n )\n\n api_version: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\"OPENAI_API_VERSION\"),\n description=\"The API version to use for the Azure OpenAI API.\",\n )\n\n _base_url_env_var: str = PrivateAttr(_AZURE_OPENAI_ENDPOINT_ENV_VAR_NAME)\n _api_key_env_var: str = PrivateAttr(_AZURE_OPENAI_API_KEY_ENV_VAR_NAME)\n _aclient: Optional[\"AsyncAzureOpenAI\"] = PrivateAttr(...) # type: ignore\n\n @override\n def load(self) -> None:\n \"\"\"Loads the `AsyncAzureOpenAI` client to benefit from async requests.\"\"\"\n # This is a workaround to avoid the `OpenAILLM` calling the _prepare_structured_output\n # in the load method before we have the proper client.\n with patch(\n \"distilabel.models.openai.OpenAILLM._prepare_structured_output\", lambda x: x\n ):\n super().load()\n\n try:\n from openai import AsyncAzureOpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n # TODO: May be worth adding the AD auth too? Also the `organization`?\n self._aclient = AsyncAzureOpenAI( # type: ignore\n azure_endpoint=self.base_url, # type: ignore\n azure_deployment=self.model,\n api_version=self.api_version,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n self._prepare_structured_output(self.structured_output)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AzureOpenAILLM.load","title":"load() ","text":"Loads the AsyncAzureOpenAI client to benefit from async requests. Source code in src/distilabel/models/llms/azure.py @override\ndef load(self) -> None:\n \"\"\"Loads the `AsyncAzureOpenAI` client to benefit from async requests.\"\"\"\n # This is a workaround to avoid the `OpenAILLM` calling the _prepare_structured_output\n # in the load method before we have the proper client.\n with patch(\n \"distilabel.models.openai.OpenAILLM._prepare_structured_output\", lambda x: x\n ):\n super().load()\n\n try:\n from openai import AsyncAzureOpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n # TODO: May be worth adding the AD auth too? 
Also the `organization`?\n self._aclient = AsyncAzureOpenAI( # type: ignore\n azure_endpoint=self.base_url, # type: ignore\n azure_deployment=self.model,\n api_version=self.api_version,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n self._prepare_structured_output(self.structured_output)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM","title":"CohereLLM ","text":" Bases: AsyncLLM Cohere API implementation using the async client for concurrent text generation. Attributes: Name Type Description model str the name of the model from the Cohere API to use for the generation. base_url Optional[RuntimeParameter[str]] the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\" . api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable. timeout RuntimeParameter[int] the maximum time in seconds to wait for a response from the API. Defaults to 120 . client_name RuntimeParameter[str] the name of the client to use for the API requests. Defaults to \"distilabel\" . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . _ChatMessage Type[ChatMessage] the ChatMessage class from the cohere package. _aclient AsyncClient the AsyncClient client from the cohere package. Runtime parameters base_url : the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\" . api_key : the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable. timeout : the maximum time in seconds to wait for a response from the API. Defaults to 120 . client_name : the name of the client to use for the API requests. Defaults to \"distilabel\" . Examples: Generate text: from distilabel.models.llms import CohereLLM\n\nllm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n\n```python\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import CohereLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = CohereLLM(\n model=\"CohereForAI/c4ai-command-r-plus\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/cohere.py class CohereLLM(AsyncLLM):\n \"\"\"Cohere API implementation using the async client for concurrent text generation.\n\n Attributes:\n model: the name of the model from the Cohere API to use for the generation.\n base_url: the base URL to use for the Cohere API requests. Defaults to\n `\"https://api.cohere.ai/v1\"`.\n api_key: the API key to authenticate the requests to the Cohere API. Defaults to\n the value of the `COHERE_API_KEY` environment variable.\n timeout: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n client_name: the name of the client to use for the API requests. 
Defaults to\n `\"distilabel\"`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n _ChatMessage: the `ChatMessage` class from the `cohere` package.\n _aclient: the `AsyncClient` client from the `cohere` package.\n\n Runtime parameters:\n - `base_url`: the base URL to use for the Cohere API requests. Defaults to\n `\"https://api.cohere.ai/v1\"`.\n - `api_key`: the API key to authenticate the requests to the Cohere API. Defaults\n to the value of the `COHERE_API_KEY` environment variable.\n - `timeout`: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n - `client_name`: the name of the client to use for the API requests. Defaults to\n `\"distilabel\"`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import CohereLLM\n\n llm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import CohereLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = CohereLLM(\n model=\"CohereForAI/c4ai-command-r-plus\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n \"COHERE_BASE_URL\", \"https://api.cohere.ai/v1\"\n ),\n description=\"The base URL to use for the Cohere API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_COHERE_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Cohere API.\",\n )\n timeout: RuntimeParameter[int] = Field(\n default=120,\n description=\"The maximum time in seconds to wait for a response from the API.\",\n )\n client_name: RuntimeParameter[str] = Field(\n default=\"distilabel\",\n description=\"The name of the client to use for the API requests.\",\n )\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _num_generations_param_supported = False\n\n _ChatMessage: Type[\"ChatMessage\"] = PrivateAttr(...)\n _aclient: \"AsyncClient\" = PrivateAttr(...)\n _tokenizer: \"Tokenizer\" = PrivateAttr(...)\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n def load(self) -> None:\n \"\"\"Loads the `AsyncClient` client from the `cohere` package.\"\"\"\n\n super().load()\n\n try:\n from cohere import AsyncClient, ChatMessage\n except ImportError as ie:\n raise ImportError(\n \"The `cohere` package is required to use the `CohereLLM` class.\"\n ) from ie\n\n self._ChatMessage = ChatMessage\n\n self._aclient = AsyncClient(\n api_key=self.api_key.get_secret_value(), # type: ignore\n client_name=self.client_name,\n base_url=self.base_url,\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n 
structured_output=self.structured_output,\n client=self._aclient,\n framework=\"cohere\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n from cohere.manually_maintained.tokenizers import get_hf_tokenizer\n\n self._tokenizer: \"Tokenizer\" = get_hf_tokenizer(self._aclient, self.model)\n\n def _format_chat_to_cohere(\n self, input: \"FormattedInput\"\n ) -> Tuple[Union[str, None], List[\"ChatMessage\"], str]:\n \"\"\"Formats the chat input to the Cohere Chat API conversational format.\n\n Args:\n input: The chat input to format.\n\n Returns:\n A tuple containing the system, chat history, and message.\n \"\"\"\n system = None\n message = None\n chat_history = []\n for item in input:\n role = item[\"role\"]\n content = item[\"content\"]\n if role == \"system\":\n system = content\n elif role == \"user\":\n message = content\n elif role == \"assistant\":\n if message is None:\n raise ValueError(\n \"An assistant message but be preceded by a user message.\"\n )\n chat_history.append(self._ChatMessage(role=\"USER\", message=message)) # type: ignore\n chat_history.append(self._ChatMessage(role=\"CHATBOT\", message=content)) # type: ignore\n message = None\n\n if message is None:\n raise ValueError(\"The chat input must end with a user message.\")\n\n return system, chat_history, message\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n temperature: Optional[float] = None,\n max_tokens: Optional[int] = None,\n k: Optional[int] = None,\n p: Optional[float] = None,\n seed: Optional[float] = None,\n stop_sequences: Optional[Sequence[str]] = None,\n frequency_penalty: Optional[float] = None,\n presence_penalty: Optional[float] = None,\n raw_prompting: Optional[bool] = None,\n ) -> GenerateOutput:\n \"\"\"Generates a response from the LLM given an input.\n\n Args:\n input: a single input in chat format to generate responses for.\n temperature: the temperature to use for the generation. Defaults to `None`.\n max_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `None`.\n k: the number of highest probability vocabulary tokens to keep for the generation.\n Defaults to `None`.\n p: the nucleus sampling probability to use for the generation. Defaults to\n `None`.\n seed: the seed to use for the generation. Defaults to `None`.\n stop_sequences: a list of sequences to use as stopping criteria for the generation.\n Defaults to `None`.\n frequency_penalty: the frequency penalty to use for the generation. Defaults\n to `None`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `None`.\n raw_prompting: a flag to use raw prompting for the generation. 
Defaults to\n `None`.\n\n Returns:\n The generated response from the Cohere API model.\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output, # type: ignore\n client=self._aclient,\n framework=\"cohere\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n system, chat_history, message = self._format_chat_to_cohere(input)\n\n kwargs = {\n \"message\": message,\n \"model\": self.model,\n \"preamble\": system,\n \"chat_history\": chat_history,\n \"temperature\": temperature,\n \"max_tokens\": max_tokens,\n \"k\": k,\n \"p\": p,\n \"seed\": seed,\n \"stop_sequences\": stop_sequences,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"raw_prompting\": raw_prompting,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore\n\n response: Union[\"Message\", \"BaseModel\"] = await self._aclient.chat(**kwargs) # type: ignore\n\n if structured_output:\n return prepare_output(\n [response.model_dump_json()],\n **self._get_llm_statistics(\n input, orjson.dumps(response.model_dump_json()).decode(\"utf-8\")\n ), # type: ignore\n )\n\n if (text := response.text) == \"\":\n self._logger.warning( # type: ignore\n f\"Received no response using Cohere client (model: '{self.model}').\"\n f\" Finish reason was: {response.finish_reason}\"\n )\n return prepare_output(\n [None],\n **self._get_llm_statistics(input, \"\"),\n )\n\n return prepare_output(\n [text],\n **self._get_llm_statistics(input, text),\n )\n\n def _get_llm_statistics(\n self, input: FormattedInput, output: str\n ) -> \"LLMStatistics\":\n return {\n \"input_tokens\": [compute_tokens(input, self._tokenizer.encode)],\n \"output_tokens\": [compute_tokens(output, self._tokenizer.encode)],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.load","title":"load() ","text":"Loads the AsyncClient client from the cohere package. 
Source code in src/distilabel/models/llms/cohere.py def load(self) -> None:\n \"\"\"Loads the `AsyncClient` client from the `cohere` package.\"\"\"\n\n super().load()\n\n try:\n from cohere import AsyncClient, ChatMessage\n except ImportError as ie:\n raise ImportError(\n \"The `cohere` package is required to use the `CohereLLM` class.\"\n ) from ie\n\n self._ChatMessage = ChatMessage\n\n self._aclient = AsyncClient(\n api_key=self.api_key.get_secret_value(), # type: ignore\n client_name=self.client_name,\n base_url=self.base_url,\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"cohere\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n from cohere.manually_maintained.tokenizers import get_hf_tokenizer\n\n self._tokenizer: \"Tokenizer\" = get_hf_tokenizer(self._aclient, self.model)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM._format_chat_to_cohere","title":"_format_chat_to_cohere(input) ","text":"Formats the chat input to the Cohere Chat API conversational format. Parameters: Name Type Description Default input FormattedInput The chat input to format. required Returns: Type Description Tuple[Union[str, None], List[ChatMessage], str] A tuple containing the system, chat history, and message. Source code in src/distilabel/models/llms/cohere.py def _format_chat_to_cohere(\n self, input: \"FormattedInput\"\n) -> Tuple[Union[str, None], List[\"ChatMessage\"], str]:\n \"\"\"Formats the chat input to the Cohere Chat API conversational format.\n\n Args:\n input: The chat input to format.\n\n Returns:\n A tuple containing the system, chat history, and message.\n \"\"\"\n system = None\n message = None\n chat_history = []\n for item in input:\n role = item[\"role\"]\n content = item[\"content\"]\n if role == \"system\":\n system = content\n elif role == \"user\":\n message = content\n elif role == \"assistant\":\n if message is None:\n raise ValueError(\n \"An assistant message but be preceded by a user message.\"\n )\n chat_history.append(self._ChatMessage(role=\"USER\", message=message)) # type: ignore\n chat_history.append(self._ChatMessage(role=\"CHATBOT\", message=content)) # type: ignore\n message = None\n\n if message is None:\n raise ValueError(\"The chat input must end with a user message.\")\n\n return system, chat_history, message\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.agenerate","title":"agenerate(input, temperature=None, max_tokens=None, k=None, p=None, seed=None, stop_sequences=None, frequency_penalty=None, presence_penalty=None, raw_prompting=None) async ","text":"Generates a response from the LLM given an input. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required temperature Optional[float] the temperature to use for the generation. Defaults to None . None max_tokens Optional[int] the maximum number of new tokens that the model will generate. Defaults to None . None k Optional[int] the number of highest probability vocabulary tokens to keep for the generation. Defaults to None . None p Optional[float] the nucleus sampling probability to use for the generation. Defaults to None . None seed Optional[float] the seed to use for the generation. Defaults to None . 
None stop_sequences Optional[Sequence[str]] a list of sequences to use as stopping criteria for the generation. Defaults to None . None frequency_penalty Optional[float] the frequency penalty to use for the generation. Defaults to None . None presence_penalty Optional[float] the presence penalty to use for the generation. Defaults to None . None raw_prompting Optional[bool] a flag to use raw prompting for the generation. Defaults to None . None Returns: Type Description GenerateOutput The generated response from the Cohere API model. Source code in src/distilabel/models/llms/cohere.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n temperature: Optional[float] = None,\n max_tokens: Optional[int] = None,\n k: Optional[int] = None,\n p: Optional[float] = None,\n seed: Optional[float] = None,\n stop_sequences: Optional[Sequence[str]] = None,\n frequency_penalty: Optional[float] = None,\n presence_penalty: Optional[float] = None,\n raw_prompting: Optional[bool] = None,\n) -> GenerateOutput:\n \"\"\"Generates a response from the LLM given an input.\n\n Args:\n input: a single input in chat format to generate responses for.\n temperature: the temperature to use for the generation. Defaults to `None`.\n max_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `None`.\n k: the number of highest probability vocabulary tokens to keep for the generation.\n Defaults to `None`.\n p: the nucleus sampling probability to use for the generation. Defaults to\n `None`.\n seed: the seed to use for the generation. Defaults to `None`.\n stop_sequences: a list of sequences to use as stopping criteria for the generation.\n Defaults to `None`.\n frequency_penalty: the frequency penalty to use for the generation. Defaults\n to `None`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `None`.\n raw_prompting: a flag to use raw prompting for the generation. 
Defaults to\n `None`.\n\n Returns:\n The generated response from the Cohere API model.\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output, # type: ignore\n client=self._aclient,\n framework=\"cohere\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n system, chat_history, message = self._format_chat_to_cohere(input)\n\n kwargs = {\n \"message\": message,\n \"model\": self.model,\n \"preamble\": system,\n \"chat_history\": chat_history,\n \"temperature\": temperature,\n \"max_tokens\": max_tokens,\n \"k\": k,\n \"p\": p,\n \"seed\": seed,\n \"stop_sequences\": stop_sequences,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"raw_prompting\": raw_prompting,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore\n\n response: Union[\"Message\", \"BaseModel\"] = await self._aclient.chat(**kwargs) # type: ignore\n\n if structured_output:\n return prepare_output(\n [response.model_dump_json()],\n **self._get_llm_statistics(\n input, orjson.dumps(response.model_dump_json()).decode(\"utf-8\")\n ), # type: ignore\n )\n\n if (text := response.text) == \"\":\n self._logger.warning( # type: ignore\n f\"Received no response using Cohere client (model: '{self.model}').\"\n f\" Finish reason was: {response.finish_reason}\"\n )\n return prepare_output(\n [None],\n **self._get_llm_statistics(input, \"\"),\n )\n\n return prepare_output(\n [text],\n **self._get_llm_statistics(input, text),\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM","title":"GroqLLM ","text":" Bases: AsyncLLM Groq API implementation using the async client for concurrent text generation. Attributes: Name Type Description model str the name of the model from the Groq API to use for the generation. base_url Optional[RuntimeParameter[str]] the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\" . api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable. max_retries RuntimeParameter[int] the maximum number of times to retry the request to the API before failing. Defaults to 2 . timeout RuntimeParameter[int] the maximum time in seconds to wait for a response from the API. Defaults to 120 . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . _api_key_env_var str the name of the environment variable to use for the API key. _aclient Optional[AsyncGroq] the AsyncGroq client from the groq package. Runtime parameters base_url : the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\" . api_key : the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable. max_retries : the maximum number of times to retry the request to the API before failing. Defaults to 2 . timeout : the maximum time in seconds to wait for a response from the API. Defaults to 120 . 
Examples: Generate text: from distilabel.models.llms import GroqLLM\n\nllm = GroqLLM(model=\"llama3-70b-8192\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n\n```python\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import GroqLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = GroqLLM(\n model=\"llama3-70b-8192\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/groq.py class GroqLLM(AsyncLLM):\n \"\"\"Groq API implementation using the async client for concurrent text generation.\n\n Attributes:\n model: the name of the model from the Groq API to use for the generation.\n base_url: the base URL to use for the Groq API requests. Defaults to\n `\"https://api.groq.com\"`.\n api_key: the API key to authenticate the requests to the Groq API. Defaults to\n the value of the `GROQ_API_KEY` environment variable.\n max_retries: the maximum number of times to retry the request to the API before\n failing. Defaults to `2`.\n timeout: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n _api_key_env_var: the name of the environment variable to use for the API key.\n _aclient: the `AsyncGroq` client from the `groq` package.\n\n Runtime parameters:\n - `base_url`: the base URL to use for the Groq API requests. Defaults to\n `\"https://api.groq.com\"`.\n - `api_key`: the API key to authenticate the requests to the Groq API. Defaults to\n the value of the `GROQ_API_KEY` environment variable.\n - `max_retries`: the maximum number of times to retry the request to the API before\n failing. Defaults to `2`.\n - `timeout`: the maximum time in seconds to wait for a response from the API. 
Defaults\n to `120`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import GroqLLM\n\n llm = GroqLLM(model=\"llama3-70b-8192\")\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import GroqLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = GroqLLM(\n model=\"llama3-70b-8192\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n _GROQ_API_BASE_URL_ENV_VAR_NAME, \"https://api.groq.com\"\n ),\n description=\"The base URL to use for the Groq API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_GROQ_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Groq API.\",\n )\n max_retries: RuntimeParameter[int] = Field(\n default=2,\n description=\"The maximum number of times to retry the request to the API before\"\n \" failing.\",\n )\n timeout: RuntimeParameter[int] = Field(\n default=120,\n description=\"The maximum time in seconds to wait for a response from the API.\",\n )\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _num_generations_param_supported = False\n\n _api_key_env_var: str = PrivateAttr(_GROQ_API_KEY_ENV_VAR_NAME)\n _aclient: Optional[\"AsyncGroq\"] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the `AsyncGroq` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from groq import AsyncGroq\n except ImportError as ie:\n raise ImportError(\n \"Groq Python client is not installed. 
Please install it using\"\n ' `pip install groq` or from the extras as `pip install \"distilabel[groq]\"`.'\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._aclient = AsyncGroq(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"groq\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n seed: Optional[int] = None,\n max_new_tokens: int = 128,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[str] = None,\n ) -> \"GenerateOutput\":\n \"\"\"Generates `num_generations` responses for the given input using the Groq async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n seed: the seed to use for the generation. Defaults to `None`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: the stop sequence to use for the generation. Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n\n References:\n - https://console.groq.com/docs/text-chat\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"groq\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"seed\": seed,\n \"temperature\": temperature,\n \"max_tokens\": max_new_tokens,\n \"top_p\": top_p,\n \"stream\": False,\n \"stop\": stop,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n if structured_output:\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n generations = []\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using the Groq client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n return prepare_output(generations, **self._get_llm_statistics(completion))\n\n @staticmethod\n def _get_llm_statistics(completion: \"ChatCompletion\") -> \"LLMStatistics\":\n return {\n \"input_tokens\": [completion.usage.prompt_tokens if completion else 0],\n \"output_tokens\": [completion.usage.completion_tokens if completion else 0],\n }\n 
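\n\nIllustrative usage (a minimal sketch, not part of the upstream docstring): calling the documented agenerate coroutine directly with the parameters listed in its signature, assuming the GROQ_API_KEY environment variable is set so api_key can be resolved.\n\nimport asyncio\n\nfrom distilabel.models.llms import GroqLLM\n\nllm = GroqLLM(model=\"llama3-70b-8192\")  # api_key falls back to the GROQ_API_KEY env var\nllm.load()\n\n# agenerate is a coroutine; the synchronous generate_outputs shown above is the usual entry point\nresult = asyncio.run(\n    llm.agenerate(\n        input=[{\"role\": \"user\", \"content\": \"Hello world!\"}],\n        max_new_tokens=64,\n        temperature=0.7,\n        top_p=0.9,\n        seed=42,\n    )\n)\n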
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.load","title":"load() ","text":"Loads the AsyncGroq client to benefit from async requests. Source code in src/distilabel/models/llms/groq.py def load(self) -> None:\n \"\"\"Loads the `AsyncGroq` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from groq import AsyncGroq\n except ImportError as ie:\n raise ImportError(\n \"Groq Python client is not installed. Please install it using\"\n ' `pip install groq` or from the extras as `pip install \"distilabel[groq]\"`.'\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._aclient = AsyncGroq(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"groq\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.agenerate","title":"agenerate(input, seed=None, max_new_tokens=128, temperature=1.0, top_p=1.0, stop=None) async ","text":"Generates num_generations responses for the given input using the Groq async client. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required seed Optional[int] the seed to use for the generation. Defaults to None . None max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 stop Optional[str] the stop sequence to use for the generation. Defaults to None . None Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. References - https://console.groq.com/docs/text-chat
Source code in src/distilabel/models/llms/groq.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n seed: Optional[int] = None,\n max_new_tokens: int = 128,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[str] = None,\n) -> \"GenerateOutput\":\n \"\"\"Generates `num_generations` responses for the given input using the Groq async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n seed: the seed to use for the generation. Defaults to `None`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: the stop sequence to use for the generation. Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n\n References:\n - https://console.groq.com/docs/text-chat\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"groq\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"seed\": seed,\n \"temperature\": temperature,\n \"max_tokens\": max_new_tokens,\n \"top_p\": top_p,\n \"stream\": False,\n \"stop\": stop,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n if structured_output:\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n generations = []\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using the Groq client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n return prepare_output(generations, **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM","title":"InferenceEndpointsLLM ","text":" Bases: AsyncLLM , MagpieChatTemplateMixin InferenceEndpoints LLM implementation running the async API client. This LLM will internally use huggingface_hub.AsyncInferenceClient . Attributes: Name Type Description model_id Optional[str] the model ID to use for the LLM as available in the Hugging Face Hub, which will be used to resolve the base URL for the serverless Inference Endpoints API requests. Defaults to None . endpoint_name Optional[RuntimeParameter[str]] the name of the Inference Endpoint to use for the LLM. Defaults to None . endpoint_namespace Optional[RuntimeParameter[str]] the namespace of the Inference Endpoint to use for the LLM. Defaults to None . base_url Optional[RuntimeParameter[str]] the base URL to use for the Inference Endpoints API requests. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Inference Endpoints API. tokenizer_id Optional[str] the tokenizer ID to use for the LLM as available in the Hugging Face Hub. Defaults to None , but defining one is recommended to properly format the prompt. 
model_display_name Optional[str] the model display name to use for the LLM. Defaults to None . use_magpie_template Optional[str] a flag used to enable/disable applying the Magpie pre-query template. Defaults to False . magpie_pre_query_template Optional[str] the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . structured_output Optional[RuntimeParameter[StructuredOutputType]] a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. Icon :hugging: Examples: Free serverless Inference API, set the input_batch_size of the Task that uses this to avoid Model is overloaded: from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Dedicated Inference Endpoints: from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n endpoint_name=\"<ENDPOINT_NAME>\",\n api_key=\"<HF_API_KEY>\",\n endpoint_namespace=\"<USER|ORG>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Dedicated Inference Endpoints or TGI: from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n api_key=\"<HF_API_KEY>\",\n base_url=\"<BASE_URL>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pydantic import BaseModel\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n api_key=\"api.key\",\n structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py class InferenceEndpointsLLM(AsyncLLM, MagpieChatTemplateMixin):\n \"\"\"InferenceEndpoints LLM implementation running the async API client.\n\n This LLM will internally use `huggingface_hub.AsyncInferenceClient`.\n\n Attributes:\n model_id: the model ID to use for the LLM as available in the Hugging Face Hub, which\n will be used to resolve the base URL for the serverless Inference Endpoints API requests.\n Defaults to `None`.\n endpoint_name: the name of the Inference Endpoint to use for the LLM. Defaults to `None`.\n endpoint_namespace: the namespace of the Inference Endpoint to use for the LLM. Defaults to `None`.\n base_url: the base URL to use for the Inference Endpoints API requests.\n api_key: the API key to authenticate the requests to the Inference Endpoints API.\n tokenizer_id: the tokenizer ID to use for the LLM as available in the Hugging Face Hub.\n Defaults to `None`, but defining one is recommended to properly format the prompt.\n model_display_name: the model display name to use for the LLM. 
Defaults to `None`.\n use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n template. Defaults to `False`.\n magpie_pre_query_template: the pre-query template to be applied to the prompt or\n sent to the LLM to generate an instruction or a follow up user message. Valid\n values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults\n to `None`.\n structured_output: a dictionary containing the structured output configuration or\n if more fine-grained control is needed, an instance of `OutlinesStructuredOutput`.\n Defaults to None.\n\n Icon:\n `:hugging:`\n\n Examples:\n Free serverless Inference API, set the input_batch_size of the Task that uses this to avoid Model is overloaded:\n\n ```python\n from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Dedicated Inference Endpoints:\n\n ```python\n from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n llm = InferenceEndpointsLLM(\n endpoint_name=\"<ENDPOINT_NAME>\",\n api_key=\"<HF_API_KEY>\",\n endpoint_namespace=\"<USER|ORG>\",\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Dedicated Inference Endpoints or TGI:\n\n ```python\n from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n llm = InferenceEndpointsLLM(\n api_key=\"<HF_API_KEY>\",\n base_url=\"<BASE_URL>\",\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import InferenceEndpointsLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n api_key=\"api.key\",\n structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n ```\n \"\"\"\n\n model_id: Optional[str] = None\n\n endpoint_name: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The name of the Inference Endpoint to use for the LLM.\",\n )\n endpoint_namespace: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The namespace of the Inference Endpoint to use for the LLM.\",\n )\n base_url: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The base URL to use for the Inference Endpoints API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR),\n description=\"The API key to authenticate the requests to the Inference Endpoints API.\",\n )\n\n tokenizer_id: Optional[str] = None\n model_display_name: Optional[str] = None\n\n structured_output: Optional[RuntimeParameter[StructuredOutputType]] = Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n\n _num_generations_param_supported = False\n\n _model_name: Optional[str] = PrivateAttr(default=None)\n _tokenizer: Optional[\"PreTrainedTokenizer\"] = PrivateAttr(default=None)\n _api_key_env_var: str = 
PrivateAttr(HF_TOKEN_ENV_VAR)\n _aclient: Optional[\"AsyncInferenceClient\"] = PrivateAttr(...)\n\n @model_validator(mode=\"after\") # type: ignore\n def only_one_of_model_id_endpoint_name_or_base_url_provided(\n self,\n ) -> \"InferenceEndpointsLLM\":\n \"\"\"Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also\n provided, a warning will be shown informing the user that the provided `base_url` will be ignored in\n favour of the dynamically calculated one..\"\"\"\n\n if self.base_url and (self.model_id or self.endpoint_name):\n self._logger.warning( # type: ignore\n f\"Since the `base_url={self.base_url}` is available and either one of `model_id`\"\n \" or `endpoint_name` is also provided, the `base_url` will either be ignored\"\n \" or overwritten with the one generated from either of those args, for serverless\"\n \" or dedicated inference endpoints, respectively.\"\n )\n\n if self.use_magpie_template and self.tokenizer_id is None:\n raise ValueError(\n \"`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,\"\n \" set a `tokenizer_id` and try again.\"\n )\n\n if (\n self.model_id\n and self.tokenizer_id is None\n and self.structured_output is not None\n ):\n self.tokenizer_id = self.model_id\n\n if self.base_url and not (self.model_id or self.endpoint_name):\n return self\n\n if self.model_id and not self.endpoint_name:\n return self\n\n if self.endpoint_name and not self.model_id:\n return self\n\n raise ValidationError(\n f\"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is\"\n f\" provided too, it will be overwritten instead. Found `model_id`={self.model_id},\"\n f\" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}.\"\n )\n\n def load(self) -> None: # noqa: C901\n \"\"\"Loads the `AsyncInferenceClient` client to connect to the Hugging Face Inference\n Endpoint.\n\n Raises:\n ImportError: if the `huggingface-hub` Python client is not installed.\n ValueError: if the model is not currently deployed or is not running the TGI framework.\n ImportError: if the `transformers` Python client is not installed.\n \"\"\"\n super().load()\n\n try:\n from huggingface_hub import (\n AsyncInferenceClient,\n InferenceClient,\n get_inference_endpoint,\n )\n except ImportError as ie:\n raise ImportError(\n \"Hugging Face Hub Python client is not installed. 
Please install it using\"\n \" `pip install huggingface-hub`.\"\n ) from ie\n\n if self.api_key is None:\n self.api_key = SecretStr(get_hf_token(self.__class__.__name__, \"api_key\"))\n\n if self.model_id is not None:\n client = InferenceClient(\n model=self.model_id, token=self.api_key.get_secret_value()\n )\n status = client.get_model_status()\n\n if (\n status.state not in {\"Loadable\", \"Loaded\"}\n and status.framework != \"text-generation-inference\"\n ):\n raise ValueError(\n f\"Model {self.model_id} is not currently deployed or is not running the TGI framework\"\n )\n\n self.base_url = client._resolve_url(\n model=self.model_id, task=\"text-generation\"\n )\n\n if self.endpoint_name is not None:\n client = get_inference_endpoint(\n name=self.endpoint_name,\n namespace=self.endpoint_namespace,\n token=self.api_key.get_secret_value(),\n )\n if client.status in [\"paused\", \"scaledToZero\"]:\n client.resume().wait(timeout=300)\n elif client.status == \"initializing\":\n client.wait(timeout=300)\n\n self.base_url = client.url\n self._model_name = client.repository\n\n self._aclient = AsyncInferenceClient(\n base_url=self.base_url,\n token=self.api_key.get_secret_value(),\n )\n\n if self.tokenizer_id:\n try:\n from transformers import AutoTokenizer\n except ImportError as ie:\n raise ImportError(\n \"Transformers Python client is not installed. Please install it using\"\n \" `pip install transformers`.\"\n ) from ie\n\n self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id)\n\n @property\n @override\n def model_name(self) -> Union[str, None]: # type: ignore\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return (\n self.model_display_name\n or self._model_name\n or self.model_id\n or self.endpoint_name\n or self.base_url\n )\n\n def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n prompt: str = (\n self._tokenizer.apply_chat_template( # type: ignore\n conversation=input, # type: ignore\n tokenize=False,\n add_generation_prompt=True,\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n\n def _get_structured_output(\n self, input: FormattedInput\n ) -> Tuple[\"StandardInput\", Union[Dict[str, Any], None]]:\n \"\"\"Gets the structured output (if any) for the given input.\n\n Args:\n input: a single input in chat format to generate responses for.\n\n Returns:\n The input and the structured output that will be passed as `grammar` to the\n inference endpoint or `None` if not required.\n \"\"\"\n structured_output = None\n\n # Specific structured output per input\n if isinstance(input, tuple):\n input, structured_output = input\n structured_output = {\n \"type\": structured_output[\"format\"], # type: ignore\n \"value\": structured_output[\"schema\"], # type: ignore\n }\n\n # Same structured output for all the inputs\n if structured_output is None and self.structured_output is not None:\n try:\n structured_output = {\n \"type\": self.structured_output[\"format\"], # type: ignore\n \"value\": self.structured_output[\"schema\"], # type: ignore\n }\n except KeyError as e:\n raise ValueError(\n \"To use the structured output you have to inform the `format` and `schema` in \"\n \"the `structured_output` attribute.\"\n ) from e\n\n if structured_output:\n if isinstance(structured_output[\"value\"], ModelMetaclass):\n 
structured_output[\"value\"] = structured_output[\n \"value\"\n ].model_json_schema()\n\n return input, structured_output\n\n async def _generate_with_text_generation(\n self,\n input: FormattedInput,\n max_new_tokens: int = 128,\n repetition_penalty: Optional[float] = None,\n frequency_penalty: Optional[float] = None,\n temperature: float = 1.0,\n do_sample: bool = False,\n top_n_tokens: Optional[int] = None,\n top_p: Optional[float] = None,\n top_k: Optional[int] = None,\n typical_p: Optional[float] = None,\n stop_sequences: Union[List[str], None] = None,\n return_full_text: bool = False,\n seed: Optional[int] = None,\n watermark: bool = False,\n ) -> GenerateOutput:\n input, structured_output = self._get_structured_output(input)\n prompt = self.prepare_input(input)\n generation: Union[\"TextGenerationOutput\", None] = None\n try:\n generation = await self._aclient.text_generation( # type: ignore\n prompt=prompt,\n max_new_tokens=max_new_tokens,\n do_sample=do_sample,\n typical_p=typical_p,\n repetition_penalty=repetition_penalty,\n frequency_penalty=frequency_penalty,\n temperature=temperature,\n top_n_tokens=top_n_tokens,\n top_p=top_p,\n top_k=top_k,\n stop_sequences=stop_sequences,\n return_full_text=return_full_text,\n # NOTE: here to ensure that the cache is not used and a different response is\n # generated every time\n seed=seed or random.randint(0, sys.maxsize),\n watermark=watermark,\n grammar=structured_output, # type: ignore\n details=True,\n )\n except Exception as e:\n self._logger.warning( # type: ignore\n f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n f\" Finish reason was: {e}\"\n )\n return prepare_output(\n generations=[generation.generated_text] if generation else [None],\n input_tokens=[compute_tokens(prompt, self._tokenizer.encode)], # type: ignore\n output_tokens=[\n generation.details.generated_tokens\n if generation and generation.details\n else 0\n ],\n logprobs=self._get_logprobs_from_text_generation(generation)\n if generation\n else None, # type: ignore\n )\n\n def _get_logprobs_from_text_generation(\n self, generation: \"TextGenerationOutput\"\n ) -> Union[List[List[List[\"Logprob\"]]], None]:\n if generation.details is None or generation.details.top_tokens is None:\n return None\n\n return [\n [\n [\n {\"token\": top_logprob[\"text\"], \"logprob\": top_logprob[\"logprob\"]}\n for top_logprob in token_logprobs\n ]\n for token_logprobs in generation.details.top_tokens\n ]\n ]\n\n async def _generate_with_chat_completion(\n self,\n input: \"StandardInput\",\n max_new_tokens: int = 128,\n frequency_penalty: Optional[float] = None,\n logit_bias: Optional[List[float]] = None,\n logprobs: bool = False,\n presence_penalty: Optional[float] = None,\n seed: Optional[int] = None,\n stop_sequences: Optional[List[str]] = None,\n temperature: float = 1.0,\n tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n tool_prompt: Optional[str] = None,\n tools: Optional[List[Dict[str, Any]]] = None,\n top_logprobs: Optional[PositiveInt] = None,\n top_p: Optional[float] = None,\n ) -> GenerateOutput:\n message = None\n completion: Union[\"ChatCompletionOutput\", None] = None\n output_logprobs = None\n try:\n completion = await self._aclient.chat_completion( # type: ignore\n messages=input, # type: ignore\n max_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n logit_bias=logit_bias,\n logprobs=logprobs,\n presence_penalty=presence_penalty,\n # NOTE: here to ensure that the cache is not used and a 
different response is\n # generated every time\n seed=seed or random.randint(0, sys.maxsize),\n stop=stop_sequences,\n temperature=temperature,\n tool_choice=tool_choice, # type: ignore\n tool_prompt=tool_prompt,\n tools=tools, # type: ignore\n top_logprobs=top_logprobs,\n top_p=top_p,\n )\n choice = completion.choices[0] # type: ignore\n if (message := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n if choice_logprobs := self._get_logprobs_from_choice(choice):\n output_logprobs = [choice_logprobs]\n except Exception as e:\n self._logger.warning( # type: ignore\n f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n f\" Finish reason was: {e}\"\n )\n return prepare_output(\n generations=[message],\n input_tokens=[completion.usage.prompt_tokens] if completion else None,\n output_tokens=[completion.usage.completion_tokens] if completion else None,\n logprobs=output_logprobs,\n )\n\n def _get_logprobs_from_choice(\n self, choice: \"ChatCompletionOutputComplete\"\n ) -> Union[List[List[\"Logprob\"]], None]:\n if choice.logprobs is None:\n return None\n\n return [\n [\n {\"token\": top_logprob.token, \"logprob\": top_logprob.logprob}\n for top_logprob in token_logprobs.top_logprobs\n ]\n for token_logprobs in choice.logprobs.content\n ]\n\n def _check_stop_sequences(\n self,\n stop_sequences: Optional[Union[str, List[str]]] = None,\n ) -> Union[List[str], None]:\n \"\"\"Checks that no more than 4 stop sequences are provided.\n\n Args:\n stop_sequences: the stop sequences to be checked.\n\n Returns:\n The stop sequences.\n \"\"\"\n if stop_sequences is not None:\n if isinstance(stop_sequences, str):\n stop_sequences = [stop_sequences]\n if len(stop_sequences) > 4:\n warnings.warn(\n \"Only up to 4 stop sequences are allowed, so keeping the first 4 items only.\",\n UserWarning,\n stacklevel=2,\n )\n stop_sequences = stop_sequences[:4]\n return stop_sequences\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_new_tokens: int = 128,\n frequency_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n logit_bias: Optional[List[float]] = None,\n logprobs: bool = False,\n presence_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n seed: Optional[int] = None,\n stop_sequences: Optional[List[str]] = None,\n temperature: float = 1.0,\n tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n tool_prompt: Optional[str] = None,\n tools: Optional[List[Dict[str, Any]]] = None,\n top_logprobs: Optional[PositiveInt] = None,\n top_n_tokens: Optional[PositiveInt] = None,\n top_p: Optional[float] = None,\n do_sample: bool = False,\n repetition_penalty: Optional[float] = None,\n return_full_text: bool = False,\n top_k: Optional[int] = None,\n typical_p: Optional[float] = None,\n watermark: bool = False,\n ) -> GenerateOutput:\n \"\"\"Generates completions for the given input using the async client. 
This method\n uses two methods of the `huggingface_hub.AsyncClient`: `chat_completion` and `text_generation`.\n `chat_completion` method will be used only if no `tokenizer_id` has been specified.\n Some arguments of this function are specific to the `text_generation` method, while\n some others are specific to the `chat_completion` method.\n\n Args:\n input: a single input in chat format to generate responses for.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n new tokens based on their existing frequency in the text so far, decreasing\n model's likelihood to repeat the same line verbatim. Defauls to `None`.\n logit_bias: modify the likelihood of specified tokens appearing in the completion.\n This argument is exclusive to the `chat_completion` method and will be used\n only if `tokenizer_id` is `None`.\n Defaults to `None`.\n logprobs: whether to return the log probabilities or not. This argument is exclusive\n to the `chat_completion` method and will be used only if `tokenizer_id`\n is `None`. Defaults to `False`.\n presence_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n new tokens based on whether they appear in the text so far, increasing the\n model likelihood to talk about new topics. This argument is exclusive to\n the `chat_completion` method and will be used only if `tokenizer_id` is\n `None`. Defauls to `None`.\n seed: the seed to use for the generation. Defaults to `None`.\n stop_sequences: either a single string or a list of strings containing the sequences\n to stop the generation at. Defaults to `None`, but will be set to the\n `tokenizer.eos_token` if available.\n temperature: the temperature to use for the generation. Defaults to `1.0`.\n tool_choice: the name of the tool the model should call. It can be a dictionary\n like `{\"function_name\": \"my_tool\"}` or \"auto\". If not provided, then the\n model won't use any tool. This argument is exclusive to the `chat_completion`\n method and will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n tool_prompt: A prompt to be appended before the tools. This argument is exclusive\n to the `chat_completion` method and will be used only if `tokenizer_id`\n is `None`. Defauls to `None`.\n tools: a list of tools definitions that the LLM can use.\n This argument is exclusive to the `chat_completion` method and will be used\n only if `tokenizer_id` is `None`. Defaults to `None`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. This argument is exclusive to the `chat_completion` method and\n will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n top_n_tokens: the number of top log probabilities to return per output token\n generated. This argument is exclusive of the `text_generation` method and\n will be only used if `tokenizer_id` is not `None`. Defaults to `None`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n do_sample: whether to use sampling for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id` is not\n `None`. Defaults to `False`.\n repetition_penalty: the repetition penalty to use for the generation. This argument\n is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. 
Defaults to `None`.\n return_full_text: whether to return the full text of the completion or just\n the generated text. Defaults to `False`, meaning that only the generated\n text will be returned. This argument is exclusive of the `text_generation`\n method and will be only used if `tokenizer_id` is not `None`.\n top_k: the top-k value to use for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `0.8`, since neither `0.0` nor `1.0` are valid\n values in TGI.\n typical_p: the typical-p value to use for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `None`.\n watermark: whether to add the watermark to the generated text. This argument\n is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n stop_sequences = self._check_stop_sequences(stop_sequences)\n\n if self.tokenizer_id is None:\n return await self._generate_with_chat_completion(\n input=input, # type: ignore\n max_new_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n logit_bias=logit_bias,\n logprobs=logprobs,\n presence_penalty=presence_penalty,\n seed=seed,\n stop_sequences=stop_sequences,\n temperature=temperature,\n tool_choice=tool_choice,\n tool_prompt=tool_prompt,\n tools=tools,\n top_logprobs=top_logprobs,\n top_p=top_p,\n )\n\n return await self._generate_with_text_generation(\n input=input,\n max_new_tokens=max_new_tokens,\n do_sample=do_sample,\n typical_p=typical_p,\n repetition_penalty=repetition_penalty,\n frequency_penalty=frequency_penalty,\n temperature=temperature,\n top_n_tokens=top_n_tokens,\n top_p=top_p,\n top_k=top_k,\n stop_sequences=stop_sequences,\n return_full_text=return_full_text,\n seed=seed,\n watermark=watermark,\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.model_name","title":"model_name: Union[str, None] property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.only_one_of_model_id_endpoint_name_or_base_url_provided","title":"only_one_of_model_id_endpoint_name_or_base_url_provided() ","text":"Validates that only one of model_id or endpoint_name is provided; and if base_url is also provided, a warning will be shown informing the user that the provided base_url will be ignored in favour of the dynamically calculated one.. 
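\n\nA minimal sketch (illustrative, not from the source) of the three mutually exclusive ways to point the client at a model that this validator accepts; passing both model_id and endpoint_name is rejected:\n\nfrom distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n# 1. Serverless: resolve the base URL from a Hub model id\nserverless = InferenceEndpointsLLM(model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\")\n\n# 2. Dedicated endpoint: resolve the base URL from the endpoint name (and optional namespace)\ndedicated = InferenceEndpointsLLM(endpoint_name=\"<ENDPOINT_NAME>\", endpoint_namespace=\"<USER|ORG>\")\n\n# 3. Raw URL (e.g. a TGI deployment); it is ignored/overwritten if model_id or endpoint_name is also given\nraw = InferenceEndpointsLLM(base_url=\"<BASE_URL>\")\n\n# Providing both at once is rejected by this validator:\n# InferenceEndpointsLLM(model_id=\"...\", endpoint_name=\"...\")  # raises a validation error\n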
Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py @model_validator(mode=\"after\") # type: ignore\ndef only_one_of_model_id_endpoint_name_or_base_url_provided(\n self,\n) -> \"InferenceEndpointsLLM\":\n \"\"\"Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also\n provided, a warning will be shown informing the user that the provided `base_url` will be ignored in\n favour of the dynamically calculated one..\"\"\"\n\n if self.base_url and (self.model_id or self.endpoint_name):\n self._logger.warning( # type: ignore\n f\"Since the `base_url={self.base_url}` is available and either one of `model_id`\"\n \" or `endpoint_name` is also provided, the `base_url` will either be ignored\"\n \" or overwritten with the one generated from either of those args, for serverless\"\n \" or dedicated inference endpoints, respectively.\"\n )\n\n if self.use_magpie_template and self.tokenizer_id is None:\n raise ValueError(\n \"`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,\"\n \" set a `tokenizer_id` and try again.\"\n )\n\n if (\n self.model_id\n and self.tokenizer_id is None\n and self.structured_output is not None\n ):\n self.tokenizer_id = self.model_id\n\n if self.base_url and not (self.model_id or self.endpoint_name):\n return self\n\n if self.model_id and not self.endpoint_name:\n return self\n\n if self.endpoint_name and not self.model_id:\n return self\n\n raise ValidationError(\n f\"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is\"\n f\" provided too, it will be overwritten instead. Found `model_id`={self.model_id},\"\n f\" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}.\"\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.load","title":"load() ","text":"Loads the AsyncInferenceClient client to connect to the Hugging Face Inference Endpoint. Raises: Type Description ImportError if the huggingface-hub Python client is not installed. ValueError if the model is not currently deployed or is not running the TGI framework. ImportError if the transformers Python client is not installed. Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py def load(self) -> None: # noqa: C901\n \"\"\"Loads the `AsyncInferenceClient` client to connect to the Hugging Face Inference\n Endpoint.\n\n Raises:\n ImportError: if the `huggingface-hub` Python client is not installed.\n ValueError: if the model is not currently deployed or is not running the TGI framework.\n ImportError: if the `transformers` Python client is not installed.\n \"\"\"\n super().load()\n\n try:\n from huggingface_hub import (\n AsyncInferenceClient,\n InferenceClient,\n get_inference_endpoint,\n )\n except ImportError as ie:\n raise ImportError(\n \"Hugging Face Hub Python client is not installed. 
Please install it using\"\n \" `pip install huggingface-hub`.\"\n ) from ie\n\n if self.api_key is None:\n self.api_key = SecretStr(get_hf_token(self.__class__.__name__, \"api_key\"))\n\n if self.model_id is not None:\n client = InferenceClient(\n model=self.model_id, token=self.api_key.get_secret_value()\n )\n status = client.get_model_status()\n\n if (\n status.state not in {\"Loadable\", \"Loaded\"}\n and status.framework != \"text-generation-inference\"\n ):\n raise ValueError(\n f\"Model {self.model_id} is not currently deployed or is not running the TGI framework\"\n )\n\n self.base_url = client._resolve_url(\n model=self.model_id, task=\"text-generation\"\n )\n\n if self.endpoint_name is not None:\n client = get_inference_endpoint(\n name=self.endpoint_name,\n namespace=self.endpoint_namespace,\n token=self.api_key.get_secret_value(),\n )\n if client.status in [\"paused\", \"scaledToZero\"]:\n client.resume().wait(timeout=300)\n elif client.status == \"initializing\":\n client.wait(timeout=300)\n\n self.base_url = client.url\n self._model_name = client.repository\n\n self._aclient = AsyncInferenceClient(\n base_url=self.base_url,\n token=self.api_key.get_secret_value(),\n )\n\n if self.tokenizer_id:\n try:\n from transformers import AutoTokenizer\n except ImportError as ie:\n raise ImportError(\n \"Transformers Python client is not installed. Please install it using\"\n \" `pip install transformers`.\"\n ) from ie\n\n self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.prepare_input","title":"prepare_input(input) ","text":"Prepares the input (applying the chat template and tokenization) for the provided input. Parameters: Name Type Description Default input StandardInput the input list containing chat items. required Returns: Type Description str The prompt to send to the LLM. Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n prompt: str = (\n self._tokenizer.apply_chat_template( # type: ignore\n conversation=input, # type: ignore\n tokenize=False,\n add_generation_prompt=True,\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM._get_structured_output","title":"_get_structured_output(input) ","text":"Gets the structured output (if any) for the given input. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required Returns: Type Description StandardInput The input and the structured output that will be passed as grammar to the Union[Dict[str, Any], None] inference endpoint or None if not required. 
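\n\nA small illustrative sketch (the User model and messages below are hypothetical) of the per-input tuple form this helper consumes and the grammar payload it produces:\n\nfrom pydantic import BaseModel\n\nclass User(BaseModel):\n    name: str\n    id: int\n\nmessages = [{\"role\": \"user\", \"content\": \"Create a user\"}]\n\n# Per-input configuration: a (messages, structured_output) tuple\nformatted_input = (messages, {\"format\": \"json\", \"schema\": User.model_json_schema()})\n\n# _get_structured_output maps the dict above to the grammar sent to the endpoint:\n# {\"type\": \"json\", \"value\": <the User JSON schema>}\n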
Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py def _get_structured_output(\n self, input: FormattedInput\n) -> Tuple[\"StandardInput\", Union[Dict[str, Any], None]]:\n \"\"\"Gets the structured output (if any) for the given input.\n\n Args:\n input: a single input in chat format to generate responses for.\n\n Returns:\n The input and the structured output that will be passed as `grammar` to the\n inference endpoint or `None` if not required.\n \"\"\"\n structured_output = None\n\n # Specific structured output per input\n if isinstance(input, tuple):\n input, structured_output = input\n structured_output = {\n \"type\": structured_output[\"format\"], # type: ignore\n \"value\": structured_output[\"schema\"], # type: ignore\n }\n\n # Same structured output for all the inputs\n if structured_output is None and self.structured_output is not None:\n try:\n structured_output = {\n \"type\": self.structured_output[\"format\"], # type: ignore\n \"value\": self.structured_output[\"schema\"], # type: ignore\n }\n except KeyError as e:\n raise ValueError(\n \"To use the structured output you have to inform the `format` and `schema` in \"\n \"the `structured_output` attribute.\"\n ) from e\n\n if structured_output:\n if isinstance(structured_output[\"value\"], ModelMetaclass):\n structured_output[\"value\"] = structured_output[\n \"value\"\n ].model_json_schema()\n\n return input, structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM._check_stop_sequences","title":"_check_stop_sequences(stop_sequences=None) ","text":"Checks that no more than 4 stop sequences are provided. Parameters: Name Type Description Default stop_sequences Optional[Union[str, List[str]]] the stop sequences to be checked. None Returns: Type Description Union[List[str], None] The stop sequences. Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py def _check_stop_sequences(\n self,\n stop_sequences: Optional[Union[str, List[str]]] = None,\n) -> Union[List[str], None]:\n \"\"\"Checks that no more than 4 stop sequences are provided.\n\n Args:\n stop_sequences: the stop sequences to be checked.\n\n Returns:\n The stop sequences.\n \"\"\"\n if stop_sequences is not None:\n if isinstance(stop_sequences, str):\n stop_sequences = [stop_sequences]\n if len(stop_sequences) > 4:\n warnings.warn(\n \"Only up to 4 stop sequences are allowed, so keeping the first 4 items only.\",\n UserWarning,\n stacklevel=2,\n )\n stop_sequences = stop_sequences[:4]\n return stop_sequences\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.agenerate","title":"agenerate(input, max_new_tokens=128, frequency_penalty=None, logit_bias=None, logprobs=False, presence_penalty=None, seed=None, stop_sequences=None, temperature=1.0, tool_choice=None, tool_prompt=None, tools=None, top_logprobs=None, top_n_tokens=None, top_p=None, do_sample=False, repetition_penalty=None, return_full_text=False, top_k=None, typical_p=None, watermark=False) async ","text":"Generates completions for the given input using the async client. This method uses two methods of the huggingface_hub.AsyncClient : chat_completion and text_generation . chat_completion method will be used only if no tokenizer_id has been specified. Some arguments of this function are specific to the text_generation method, while some others are specific to the chat_completion method. 
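\n\nA minimal sketch (illustrative) of how tokenizer_id selects between the two underlying client methods described above:\n\nfrom distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n# Without tokenizer_id, requests go through chat_completion (chat-specific arguments apply)\nchat_llm = InferenceEndpointsLLM(model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\")\n\n# With tokenizer_id, the prompt is templated locally and sent through text_generation\ntgi_llm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\n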
Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 frequency_penalty Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] a value between -2.0 and 2.0 . Positive values penalize new tokens based on their existing frequency in the text so far, decreasing model's likelihood to repeat the same line verbatim. Defauls to None . None logit_bias Optional[List[float]] modify the likelihood of specified tokens appearing in the completion. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defaults to None . None logprobs bool whether to return the log probabilities or not. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defaults to False . False presence_penalty Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] a value between -2.0 and 2.0 . Positive values penalize new tokens based on whether they appear in the text so far, increasing the model likelihood to talk about new topics. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defauls to None . None seed Optional[int] the seed to use for the generation. Defaults to None . None stop_sequences Optional[List[str]] either a single string or a list of strings containing the sequences to stop the generation at. Defaults to None , but will be set to the tokenizer.eos_token if available. None temperature float the temperature to use for the generation. Defaults to 1.0 . 1.0 tool_choice Optional[Union[Dict[str, str], Literal['auto']]] the name of the tool the model should call. It can be a dictionary like {\"function_name\": \"my_tool\"} or \"auto\". If not provided, then the model won't use any tool. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defaults to None . None tool_prompt Optional[str] A prompt to be appended before the tools. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defauls to None . None tools Optional[List[Dict[str, Any]]] a list of tools definitions that the LLM can use. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defaults to None . None top_logprobs Optional[PositiveInt] the number of top log probabilities to return per output token generated. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None . Defaults to None . None top_n_tokens Optional[PositiveInt] the number of top log probabilities to return per output token generated. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to None . None top_p Optional[float] the top-p value to use for the generation. Defaults to 1.0 . None do_sample bool whether to use sampling for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to False . False repetition_penalty Optional[float] the repetition penalty to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to None . None return_full_text bool whether to return the full text of the completion or just the generated text. 
Defaults to False , meaning that only the generated text will be returned. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . False top_k Optional[int] the top-k value to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to 0.8 , since neither 0.0 nor 1.0 are valid values in TGI. None typical_p Optional[float] the typical-p value to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to None . None watermark bool whether to add the watermark to the generated text. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None . Defaults to None . False Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_new_tokens: int = 128,\n frequency_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n logit_bias: Optional[List[float]] = None,\n logprobs: bool = False,\n presence_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n seed: Optional[int] = None,\n stop_sequences: Optional[List[str]] = None,\n temperature: float = 1.0,\n tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n tool_prompt: Optional[str] = None,\n tools: Optional[List[Dict[str, Any]]] = None,\n top_logprobs: Optional[PositiveInt] = None,\n top_n_tokens: Optional[PositiveInt] = None,\n top_p: Optional[float] = None,\n do_sample: bool = False,\n repetition_penalty: Optional[float] = None,\n return_full_text: bool = False,\n top_k: Optional[int] = None,\n typical_p: Optional[float] = None,\n watermark: bool = False,\n) -> GenerateOutput:\n \"\"\"Generates completions for the given input using the async client. This method\n uses two methods of the `huggingface_hub.AsyncClient`: `chat_completion` and `text_generation`.\n `chat_completion` method will be used only if no `tokenizer_id` has been specified.\n Some arguments of this function are specific to the `text_generation` method, while\n some others are specific to the `chat_completion` method.\n\n Args:\n input: a single input in chat format to generate responses for.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n new tokens based on their existing frequency in the text so far, decreasing\n model's likelihood to repeat the same line verbatim. Defauls to `None`.\n logit_bias: modify the likelihood of specified tokens appearing in the completion.\n This argument is exclusive to the `chat_completion` method and will be used\n only if `tokenizer_id` is `None`.\n Defaults to `None`.\n logprobs: whether to return the log probabilities or not. This argument is exclusive\n to the `chat_completion` method and will be used only if `tokenizer_id`\n is `None`. Defaults to `False`.\n presence_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n new tokens based on whether they appear in the text so far, increasing the\n model likelihood to talk about new topics. This argument is exclusive to\n the `chat_completion` method and will be used only if `tokenizer_id` is\n `None`. 
Defauls to `None`.\n seed: the seed to use for the generation. Defaults to `None`.\n stop_sequences: either a single string or a list of strings containing the sequences\n to stop the generation at. Defaults to `None`, but will be set to the\n `tokenizer.eos_token` if available.\n temperature: the temperature to use for the generation. Defaults to `1.0`.\n tool_choice: the name of the tool the model should call. It can be a dictionary\n like `{\"function_name\": \"my_tool\"}` or \"auto\". If not provided, then the\n model won't use any tool. This argument is exclusive to the `chat_completion`\n method and will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n tool_prompt: A prompt to be appended before the tools. This argument is exclusive\n to the `chat_completion` method and will be used only if `tokenizer_id`\n is `None`. Defauls to `None`.\n tools: a list of tools definitions that the LLM can use.\n This argument is exclusive to the `chat_completion` method and will be used\n only if `tokenizer_id` is `None`. Defaults to `None`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. This argument is exclusive to the `chat_completion` method and\n will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n top_n_tokens: the number of top log probabilities to return per output token\n generated. This argument is exclusive of the `text_generation` method and\n will be only used if `tokenizer_id` is not `None`. Defaults to `None`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n do_sample: whether to use sampling for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id` is not\n `None`. Defaults to `False`.\n repetition_penalty: the repetition penalty to use for the generation. This argument\n is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `None`.\n return_full_text: whether to return the full text of the completion or just\n the generated text. Defaults to `False`, meaning that only the generated\n text will be returned. This argument is exclusive of the `text_generation`\n method and will be only used if `tokenizer_id` is not `None`.\n top_k: the top-k value to use for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `0.8`, since neither `0.0` nor `1.0` are valid\n values in TGI.\n typical_p: the typical-p value to use for the generation. This argument is exclusive\n of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. Defaults to `None`.\n watermark: whether to add the watermark to the generated text. This argument\n is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n is not `None`. 
Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n stop_sequences = self._check_stop_sequences(stop_sequences)\n\n if self.tokenizer_id is None:\n return await self._generate_with_chat_completion(\n input=input, # type: ignore\n max_new_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n logit_bias=logit_bias,\n logprobs=logprobs,\n presence_penalty=presence_penalty,\n seed=seed,\n stop_sequences=stop_sequences,\n temperature=temperature,\n tool_choice=tool_choice,\n tool_prompt=tool_prompt,\n tools=tools,\n top_logprobs=top_logprobs,\n top_p=top_p,\n )\n\n return await self._generate_with_text_generation(\n input=input,\n max_new_tokens=max_new_tokens,\n do_sample=do_sample,\n typical_p=typical_p,\n repetition_penalty=repetition_penalty,\n frequency_penalty=frequency_penalty,\n temperature=temperature,\n top_n_tokens=top_n_tokens,\n top_p=top_p,\n top_k=top_k,\n stop_sequences=stop_sequences,\n return_full_text=return_full_text,\n seed=seed,\n watermark=watermark,\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM","title":"TransformersLLM ","text":" Bases: LLM , MagpieChatTemplateMixin , CudaDevicePlacementMixin Hugging Face transformers library LLM implementation using the text generation pipeline. Attributes: Name Type Description model str the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. revision str if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . torch_dtype str the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\" . trust_remote_code bool whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . model_kwargs Optional[Dict[str, Any]] additional dictionary of keyword arguments that will be passed to the from_pretrained method of the model. tokenizer Optional[str] the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer config files. If not provided, the one associated to the model will be used. Defaults to None . use_fast bool whether to use a fast tokenizer or not. Defaults to True . chat_template Optional[str] a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None . device Optional[Union[str, int]] the name or index of the device where the model will be loaded. Defaults to None . device_map Optional[Union[str, Dict[str, Any]]] a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\" . Defaults to None . token Optional[SecretStr] the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None . structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]] a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. use_magpie_template Optional[RuntimeParameter[OutlinesStructuredOutputType]] a flag used to enable/disable applying the Magpie pre-query template. 
Defaults to False . magpie_pre_query_template Optional[RuntimeParameter[OutlinesStructuredOutputType]] the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . Icon :hugging: Examples: Generate text: from distilabel.models.llms import TransformersLLM\n\nllm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Source code in src/distilabel/models/llms/huggingface/transformers.py class TransformersLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin):\n \"\"\"Hugging Face `transformers` library LLM implementation using the text generation\n pipeline.\n\n Attributes:\n model: the model Hugging Face Hub repo id or a path to a directory containing the\n model weights and configuration files.\n revision: if `model` refers to a Hugging Face Hub repository, then the revision\n (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc.\n Defaults to `\"auto\"`.\n trust_remote_code: whether to allow fetching and executing remote code fetched\n from the repository in the Hub. Defaults to `False`.\n model_kwargs: additional dictionary of keyword arguments that will be passed to\n the `from_pretrained` method of the model.\n tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing\n the tokenizer config files. If not provided, the one associated to the `model`\n will be used. Defaults to `None`.\n use_fast: whether to use a fast tokenizer or not. Defaults to `True`.\n chat_template: a chat template that will be used to build the prompts before\n sending them to the model. If not provided, the chat template defined in the\n tokenizer config will be used. If not provided and the tokenizer doesn't have\n a chat template, then ChatML template will be used. Defaults to `None`.\n device: the name or index of the device where the model will be loaded. Defaults\n to `None`.\n device_map: a dictionary mapping each layer of the model to a device, or a mode\n like `\"sequential\"` or `\"auto\"`. Defaults to `None`.\n token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n local configuration will be used. Defaults to `None`.\n structured_output: a dictionary containing the structured output configuration or if more\n fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n template. Defaults to `False`.\n magpie_pre_query_template: the pre-query template to be applied to the prompt or\n sent to the LLM to generate an instruction or a follow up user message. Valid\n values are \"llama3\", \"qwen2\" or another pre-query template provided. 
Defaults\n to `None`.\n\n Icon:\n `:hugging:`\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import TransformersLLM\n\n llm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n \"\"\"\n\n model: str\n revision: str = \"main\"\n torch_dtype: str = \"auto\"\n trust_remote_code: bool = False\n model_kwargs: Optional[Dict[str, Any]] = None\n tokenizer: Optional[str] = None\n use_fast: bool = True\n chat_template: Optional[str] = None\n device: Optional[Union[str, int]] = None\n device_map: Optional[Union[str, Dict[str, Any]]] = None\n token: Optional[SecretStr] = Field(\n default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR)\n )\n structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n\n _pipeline: Optional[\"Pipeline\"] = PrivateAttr(...)\n _prefix_allowed_tokens_fn: Union[Callable, None] = PrivateAttr(default=None)\n\n def load(self) -> None:\n \"\"\"Loads the model and tokenizer and creates the text generation pipeline. In addition,\n it will configure the tokenizer chat template.\"\"\"\n if self.device == \"cuda\":\n CudaDevicePlacementMixin.load(self)\n\n try:\n from transformers import pipeline\n except ImportError as ie:\n raise ImportError(\n \"Transformers is not installed. Please install it using `pip install transformers`.\"\n ) from ie\n\n token = self.token.get_secret_value() if self.token is not None else self.token\n\n self._pipeline = pipeline(\n \"text-generation\",\n model=self.model,\n revision=self.revision,\n torch_dtype=self.torch_dtype,\n trust_remote_code=self.trust_remote_code,\n model_kwargs=self.model_kwargs or {},\n tokenizer=self.tokenizer or self.model,\n use_fast=self.use_fast,\n device=self.device,\n device_map=self.device_map,\n token=token,\n return_full_text=False,\n )\n\n if self.chat_template is not None:\n self._pipeline.tokenizer.chat_template = self.chat_template # type: ignore\n\n if self._pipeline.tokenizer.pad_token is None: # type: ignore\n self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token # type: ignore\n\n if self.structured_output:\n self._prefix_allowed_tokens_fn = self._prepare_structured_output(\n self.structured_output\n )\n\n super().load()\n\n def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n if self._pipeline.tokenizer.chat_template: # type: ignore\n return input[0][\"content\"]\n\n prompt: str = (\n self._pipeline.tokenizer.apply_chat_template( # type: ignore\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True,\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n\n @validate_call\n def generate( # type: ignore\n self,\n inputs: List[StandardInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n temperature: float = 0.1,\n repetition_penalty: float = 1.1,\n top_p: float = 1.0,\n 
top_k: int = 0,\n do_sample: bool = True,\n ) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for each input using the text generation\n pipeline.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n repetition_penalty: the repetition penalty to use for the generation. Defaults\n to `1.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n top_k: the top-k value to use for the generation. Defaults to `0`.\n do_sample: whether to use sampling or not. Defaults to `True`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n prepared_inputs = [self.prepare_input(input=input) for input in inputs]\n\n outputs: List[List[Dict[str, str]]] = self._pipeline( # type: ignore\n prepared_inputs,\n max_new_tokens=max_new_tokens,\n temperature=temperature,\n repetition_penalty=repetition_penalty,\n top_p=top_p,\n top_k=top_k,\n do_sample=do_sample,\n num_return_sequences=num_generations,\n prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn,\n pad_token_id=self._pipeline.tokenizer.eos_token_id, # type: ignore\n )\n llm_output = [\n [generation[\"generated_text\"] for generation in output]\n for output in outputs\n ]\n\n result = []\n for input, output in zip(inputs, llm_output):\n result.append(\n prepare_output(\n output,\n input_tokens=[\n compute_tokens(input, self._pipeline.tokenizer.encode)\n ],\n output_tokens=[\n compute_tokens(row, self._pipeline.tokenizer.encode)\n for row in output\n ],\n )\n )\n\n return result\n\n def get_last_hidden_states(\n self, inputs: List[\"StandardInput\"]\n ) -> List[\"HiddenState\"]:\n \"\"\"Gets the last `hidden_states` of the model for the given inputs. 
It doesn't\n execute the task head.\n\n Args:\n inputs: a list of inputs in chat format to generate the embeddings for.\n\n Returns:\n A list containing the last hidden state for each sequence using a NumPy array\n with shape [num_tokens, hidden_size].\n \"\"\"\n model: \"PreTrainedModel\" = (\n self._pipeline.model.model # type: ignore\n if hasattr(self._pipeline.model, \"model\") # type: ignore\n else next(self._pipeline.model.children()) # type: ignore\n )\n tokenizer: \"PreTrainedTokenizer\" = self._pipeline.tokenizer # type: ignore\n input_ids = tokenizer(\n [self.prepare_input(input) for input in inputs], # type: ignore\n return_tensors=\"pt\",\n padding=True,\n ).to(model.device)\n last_hidden_states = model(**input_ids)[\"last_hidden_state\"]\n\n return [\n seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy()\n for seq_last_hidden_state, attention_mask in zip(\n last_hidden_states,\n input_ids[\"attention_mask\"], # type: ignore\n )\n ]\n\n def _prepare_structured_output(\n self, structured_output: Optional[OutlinesStructuredOutputType] = None\n ) -> Union[Callable, None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n result = prepare_guided_output(\n structured_output, \"transformers\", self._pipeline\n )\n if schema := result.get(\"schema\"):\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.load","title":"load() ","text":"Loads the model and tokenizer and creates the text generation pipeline. In addition, it will configure the tokenizer chat template. Source code in src/distilabel/models/llms/huggingface/transformers.py def load(self) -> None:\n \"\"\"Loads the model and tokenizer and creates the text generation pipeline. In addition,\n it will configure the tokenizer chat template.\"\"\"\n if self.device == \"cuda\":\n CudaDevicePlacementMixin.load(self)\n\n try:\n from transformers import pipeline\n except ImportError as ie:\n raise ImportError(\n \"Transformers is not installed. 
Please install it using `pip install transformers`.\"\n ) from ie\n\n token = self.token.get_secret_value() if self.token is not None else self.token\n\n self._pipeline = pipeline(\n \"text-generation\",\n model=self.model,\n revision=self.revision,\n torch_dtype=self.torch_dtype,\n trust_remote_code=self.trust_remote_code,\n model_kwargs=self.model_kwargs or {},\n tokenizer=self.tokenizer or self.model,\n use_fast=self.use_fast,\n device=self.device,\n device_map=self.device_map,\n token=token,\n return_full_text=False,\n )\n\n if self.chat_template is not None:\n self._pipeline.tokenizer.chat_template = self.chat_template # type: ignore\n\n if self._pipeline.tokenizer.pad_token is None: # type: ignore\n self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token # type: ignore\n\n if self.structured_output:\n self._prefix_allowed_tokens_fn = self._prepare_structured_output(\n self.structured_output\n )\n\n super().load()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.unload","title":"unload() ","text":"Unloads the vLLM model. Source code in src/distilabel/models/llms/huggingface/transformers.py def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.prepare_input","title":"prepare_input(input) ","text":"Prepares the input (applying the chat template and tokenization) for the provided input. Parameters: Name Type Description Default input StandardInput the input list containing chat items. required Returns: Type Description str The prompt to send to the LLM. Source code in src/distilabel/models/llms/huggingface/transformers.py def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n if self._pipeline.tokenizer.chat_template: # type: ignore\n return input[0][\"content\"]\n\n prompt: str = (\n self._pipeline.tokenizer.apply_chat_template( # type: ignore\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True,\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, temperature=0.1, repetition_penalty=1.1, top_p=1.0, top_k=0, do_sample=True) ","text":"Generates num_generations responses for each input using the text generation pipeline. Parameters: Name Type Description Default inputs List[StandardInput] a list of inputs in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 temperature float the temperature to use for the generation. Defaults to 0.1 . 0.1 repetition_penalty float the repetition penalty to use for the generation. Defaults to 1.1 . 1.1 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 top_k int the top-k value to use for the generation. Defaults to 0 . 0 do_sample bool whether to use sampling or not. Defaults to True . True Returns: Type Description List[GenerateOutput] A list of lists of strings containing the generated responses for each input. 
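For reference, a minimal usage sketch of the parameters documented above (the prompt and sampling values are illustrative placeholders, not recommendations; the model id simply matches the example used elsewhere on this page): from distilabel.models.llms import TransformersLLM\n\n# Sketch only: any chat-capable Hugging Face Hub repo id can be used here.\nllm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\nllm.load()\n\n# Two sampled candidates per input, capped at 64 new tokens each.\noutputs = llm.generate(\n    inputs=[[{\"role\": \"user\", \"content\": \"Write a haiku about synthetic data.\"}]],\n    num_generations=2,\n    max_new_tokens=64,\n    temperature=0.7,\n    top_p=0.9,\n    do_sample=True,\n)\n Each element of the returned list corresponds to one input, as described in the Returns section above. 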
Source code in src/distilabel/models/llms/huggingface/transformers.py @validate_call\ndef generate( # type: ignore\n self,\n inputs: List[StandardInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n temperature: float = 0.1,\n repetition_penalty: float = 1.1,\n top_p: float = 1.0,\n top_k: int = 0,\n do_sample: bool = True,\n) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for each input using the text generation\n pipeline.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n repetition_penalty: the repetition penalty to use for the generation. Defaults\n to `1.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n top_k: the top-k value to use for the generation. Defaults to `0`.\n do_sample: whether to use sampling or not. Defaults to `True`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n prepared_inputs = [self.prepare_input(input=input) for input in inputs]\n\n outputs: List[List[Dict[str, str]]] = self._pipeline( # type: ignore\n prepared_inputs,\n max_new_tokens=max_new_tokens,\n temperature=temperature,\n repetition_penalty=repetition_penalty,\n top_p=top_p,\n top_k=top_k,\n do_sample=do_sample,\n num_return_sequences=num_generations,\n prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn,\n pad_token_id=self._pipeline.tokenizer.eos_token_id, # type: ignore\n )\n llm_output = [\n [generation[\"generated_text\"] for generation in output]\n for output in outputs\n ]\n\n result = []\n for input, output in zip(inputs, llm_output):\n result.append(\n prepare_output(\n output,\n input_tokens=[\n compute_tokens(input, self._pipeline.tokenizer.encode)\n ],\n output_tokens=[\n compute_tokens(row, self._pipeline.tokenizer.encode)\n for row in output\n ],\n )\n )\n\n return result\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.get_last_hidden_states","title":"get_last_hidden_states(inputs) ","text":"Gets the last hidden_states of the model for the given inputs. It doesn't execute the task head. Parameters: Name Type Description Default inputs List[StandardInput] a list of inputs in chat format to generate the embeddings for. required Returns: Type Description List[HiddenState] A list containing the last hidden state for each sequence using a NumPy array List[HiddenState] with shape [num_tokens, hidden_size]. Source code in src/distilabel/models/llms/huggingface/transformers.py def get_last_hidden_states(\n self, inputs: List[\"StandardInput\"]\n) -> List[\"HiddenState\"]:\n \"\"\"Gets the last `hidden_states` of the model for the given inputs. 
It doesn't\n execute the task head.\n\n Args:\n inputs: a list of inputs in chat format to generate the embeddings for.\n\n Returns:\n A list containing the last hidden state for each sequence using a NumPy array\n with shape [num_tokens, hidden_size].\n \"\"\"\n model: \"PreTrainedModel\" = (\n self._pipeline.model.model # type: ignore\n if hasattr(self._pipeline.model, \"model\") # type: ignore\n else next(self._pipeline.model.children()) # type: ignore\n )\n tokenizer: \"PreTrainedTokenizer\" = self._pipeline.tokenizer # type: ignore\n input_ids = tokenizer(\n [self.prepare_input(input) for input in inputs], # type: ignore\n return_tensors=\"pt\",\n padding=True,\n ).to(model.device)\n last_hidden_states = model(**input_ids)[\"last_hidden_state\"]\n\n return [\n seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy()\n for seq_last_hidden_state, attention_mask in zip(\n last_hidden_states,\n input_ids[\"attention_mask\"], # type: ignore\n )\n ]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output=None) ","text":"Creates the appropriate function to filter tokens to generate structured outputs. Parameters: Name Type Description Default structured_output Optional[OutlinesStructuredOutputType] the configuration dict to prepare the structured output. None Returns: Type Description Union[Callable, None] The callable that will be used to guide the generation of the model. Source code in src/distilabel/models/llms/huggingface/transformers.py def _prepare_structured_output(\n self, structured_output: Optional[OutlinesStructuredOutputType] = None\n) -> Union[Callable, None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n result = prepare_guided_output(\n structured_output, \"transformers\", self._pipeline\n )\n if schema := result.get(\"schema\"):\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM","title":"LiteLLM ","text":" Bases: AsyncLLM LiteLLM implementation running the async API client. Attributes: Name Type Description model str the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\", etc. verbose RuntimeParameter[bool] whether to log the LiteLLM client's logs. Defaults to False . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . Runtime parameters verbose : whether to log the LiteLLM client's logs. Defaults to False . 
Examples: Generate text: from distilabel.models.llms import LiteLLM\n\nllm = LiteLLM(model=\"gpt-3.5-turbo\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n\n```python\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import LiteLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = LiteLLM(\n model=\"gpt-3.5-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/litellm.py class LiteLLM(AsyncLLM):\n \"\"\"LiteLLM implementation running the async API client.\n\n Attributes:\n model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\",\n etc.\n verbose: whether to log the LiteLLM client's logs. Defaults to `False`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n\n Runtime parameters:\n - `verbose`: whether to log the LiteLLM client's logs. Defaults to `False`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import LiteLLM\n\n llm = LiteLLM(model=\"gpt-3.5-turbo\")\n\n llm.load()\n\n # Call the model\n output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import LiteLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = LiteLLM(\n model=\"gpt-3.5-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n verbose: RuntimeParameter[bool] = Field(\n default=False, description=\"Whether to log the LiteLLM client's logs.\"\n )\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _aclient: Optional[Callable] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"\n Loads the `acompletion` LiteLLM client to benefit from async requests.\n \"\"\"\n super().load()\n\n try:\n import litellm\n\n litellm.telemetry = False\n except ImportError as e:\n raise ImportError(\n \"LiteLLM Python client is not installed. 
Please install it using\"\n \" `pip install litellm`.\"\n ) from e\n self._aclient = litellm.acompletion\n\n if not self.verbose:\n litellm.suppress_debug_info = True\n for key in logging.Logger.manager.loggerDict.keys():\n if \"litellm\" not in key.lower():\n continue\n logging.getLogger(key).setLevel(logging.CRITICAL)\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"litellm\",\n )\n self._aclient = result.get(\"client\")\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n @validate_call\n async def agenerate( # type: ignore # noqa: C901\n self,\n input: FormattedInput,\n num_generations: int = 1,\n functions: Optional[List] = None,\n function_call: Optional[str] = None,\n temperature: Optional[float] = 1.0,\n top_p: Optional[float] = 1.0,\n stop: Optional[Union[str, list]] = None,\n max_tokens: Optional[int] = None,\n presence_penalty: Optional[float] = None,\n frequency_penalty: Optional[float] = None,\n logit_bias: Optional[dict] = None,\n user: Optional[str] = None,\n metadata: Optional[dict] = None,\n api_base: Optional[str] = None,\n api_version: Optional[str] = None,\n api_key: Optional[str] = None,\n model_list: Optional[list] = None,\n mock_response: Optional[str] = None,\n force_timeout: Optional[int] = 600,\n custom_llm_provider: Optional[str] = None,\n ) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the [LiteLLM async client](https://github.com/BerriAI/litellm).\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n functions: a list of functions to apply to the conversation messages. Defaults to\n `None`.\n function_call: the name of the function to call within the conversation. Defaults\n to `None`.\n temperature: the temperature to use for the generation. Defaults to `1.0`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: Up to 4 sequences where the LLM API will stop generating further tokens.\n Defaults to `None`.\n max_tokens: The maximum number of tokens in the generated completion. Defaults to\n `None`.\n presence_penalty: It is used to penalize new tokens based on their existence in the\n text so far. Defaults to `None`.\n frequency_penalty: It is used to penalize new tokens based on their frequency in the\n text so far. Defaults to `None`.\n logit_bias: Used to modify the probability of specific tokens appearing in the\n completion. Defaults to `None`.\n user: A unique identifier representing your end-user. This can help the LLM provider\n to monitor and detect abuse. Defaults to `None`.\n metadata: Pass in additional metadata to tag your completion calls - eg. prompt\n version, details, etc. Defaults to `None`.\n api_base: Base URL for the API. Defaults to `None`.\n api_version: API version. Defaults to `None`.\n api_key: API key. Defaults to `None`.\n model_list: List of api base, version, keys. Defaults to `None`.\n mock_response: If provided, return a mock completion response for testing or debugging\n purposes. 
Defaults to `None`.\n force_timeout: The maximum execution time in seconds for the completion request.\n Defaults to `600`.\n custom_llm_provider: Used for Non-OpenAI LLMs, Example usage for bedrock, set(iterable)\n model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". Defaults to\n `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n import litellm\n from litellm import token_counter\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"litellm\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"model\": self.model,\n \"messages\": input,\n \"n\": num_generations,\n \"functions\": functions,\n \"function_call\": function_call,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stream\": False,\n \"stop\": stop,\n \"max_tokens\": max_tokens,\n \"presence_penalty\": presence_penalty,\n \"frequency_penalty\": frequency_penalty,\n \"logit_bias\": logit_bias,\n \"user\": user,\n \"metadata\": metadata,\n \"api_base\": api_base,\n \"api_version\": api_version,\n \"api_key\": api_key,\n \"model_list\": model_list,\n \"mock_response\": mock_response,\n \"force_timeout\": force_timeout,\n \"custom_llm_provider\": custom_llm_provider,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n async def _call_aclient_until_n_choices() -> List[\"Choices\"]:\n choices = []\n while len(choices) < num_generations:\n completion = await self._aclient(**kwargs) # type: ignore\n if not self.structured_output:\n completion = completion.choices\n choices.extend(completion)\n return choices\n\n # litellm.drop_params is used to en/disable sending **kwargs parameters to the API if they cannot be used\n try:\n litellm.drop_params = False\n choices = await _call_aclient_until_n_choices()\n except litellm.exceptions.APIError as e:\n if \"does not support parameters\" in str(e):\n litellm.drop_params = True\n choices = await _call_aclient_until_n_choices()\n else:\n raise e\n\n generations = []\n input_tokens = [\n token_counter(model=self.model, messages=input)\n ] * num_generations\n output_tokens = []\n\n if self.structured_output:\n for choice in choices:\n generations.append(choice.model_dump_json())\n output_tokens.append(\n token_counter(\n model=self.model,\n text=orjson.dumps(choice.model_dump_json()).decode(\"utf-8\"),\n )\n )\n return prepare_output(\n generations,\n input_tokens=input_tokens,\n output_tokens=output_tokens,\n )\n\n for choice in choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using LiteLLM client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n output_tokens.append(token_counter(model=self.model, text=content))\n\n return prepare_output(\n generations, input_tokens=input_tokens, output_tokens=output_tokens\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. 
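As a quick illustration of the async client documented in this section (a sketch only: the model name, prompt and sampling values are placeholders, and asyncio.run is just one convenient way to drive the coroutine): import asyncio\n\nfrom distilabel.models.llms import LiteLLM\n\nllm = LiteLLM(model=\"gpt-3.5-turbo\")\n\nllm.load()\n\n# `agenerate` takes a single input in chat format and returns the generations for it.\nresult = asyncio.run(\n    llm.agenerate(\n        input=[{\"role\": \"user\", \"content\": \"Hello world!\"}],\n        num_generations=1,\n        temperature=0.7,\n        max_tokens=64,\n    )\n)\n 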
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.load","title":"load() ","text":"Loads the acompletion LiteLLM client to benefit from async requests. Source code in src/distilabel/models/llms/litellm.py def load(self) -> None:\n \"\"\"\n Loads the `acompletion` LiteLLM client to benefit from async requests.\n \"\"\"\n super().load()\n\n try:\n import litellm\n\n litellm.telemetry = False\n except ImportError as e:\n raise ImportError(\n \"LiteLLM Python client is not installed. Please install it using\"\n \" `pip install litellm`.\"\n ) from e\n self._aclient = litellm.acompletion\n\n if not self.verbose:\n litellm.suppress_debug_info = True\n for key in logging.Logger.manager.loggerDict.keys():\n if \"litellm\" not in key.lower():\n continue\n logging.getLogger(key).setLevel(logging.CRITICAL)\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"litellm\",\n )\n self._aclient = result.get(\"client\")\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.agenerate","title":"agenerate(input, num_generations=1, functions=None, function_call=None, temperature=1.0, top_p=1.0, stop=None, max_tokens=None, presence_penalty=None, frequency_penalty=None, logit_bias=None, user=None, metadata=None, api_base=None, api_version=None, api_key=None, model_list=None, mock_response=None, force_timeout=600, custom_llm_provider=None) async ","text":"Generates num_generations responses for the given input using the LiteLLM async client. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 functions Optional[List] a list of functions to apply to the conversation messages. Defaults to None . None function_call Optional[str] the name of the function to call within the conversation. Defaults to None . None temperature Optional[float] the temperature to use for the generation. Defaults to 1.0 . 1.0 top_p Optional[float] the top-p value to use for the generation. Defaults to 1.0 . 1.0 stop Optional[Union[str, list]] Up to 4 sequences where the LLM API will stop generating further tokens. Defaults to None . None max_tokens Optional[int] The maximum number of tokens in the generated completion. Defaults to None . None presence_penalty Optional[float] It is used to penalize new tokens based on their existence in the text so far. Defaults to None . None frequency_penalty Optional[float] It is used to penalize new tokens based on their frequency in the text so far. Defaults to None . None logit_bias Optional[dict] Used to modify the probability of specific tokens appearing in the completion. Defaults to None . None user Optional[str] A unique identifier representing your end-user. This can help the LLM provider to monitor and detect abuse. Defaults to None . None metadata Optional[dict] Pass in additional metadata to tag your completion calls - eg. prompt version, details, etc. Defaults to None . None api_base Optional[str] Base URL for the API. Defaults to None . None api_version Optional[str] API version. Defaults to None . None api_key Optional[str] API key. Defaults to None . None model_list Optional[list] List of api base, version, keys. Defaults to None . 
None mock_response Optional[str] If provided, return a mock completion response for testing or debugging purposes. Defaults to None . None force_timeout Optional[int] The maximum execution time in seconds for the completion request. Defaults to 600 . 600 custom_llm_provider Optional[str] Used for Non-OpenAI LLMs, Example usage for bedrock, set(iterable) model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". Defaults to None . None Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/litellm.py @validate_call\nasync def agenerate( # type: ignore # noqa: C901\n self,\n input: FormattedInput,\n num_generations: int = 1,\n functions: Optional[List] = None,\n function_call: Optional[str] = None,\n temperature: Optional[float] = 1.0,\n top_p: Optional[float] = 1.0,\n stop: Optional[Union[str, list]] = None,\n max_tokens: Optional[int] = None,\n presence_penalty: Optional[float] = None,\n frequency_penalty: Optional[float] = None,\n logit_bias: Optional[dict] = None,\n user: Optional[str] = None,\n metadata: Optional[dict] = None,\n api_base: Optional[str] = None,\n api_version: Optional[str] = None,\n api_key: Optional[str] = None,\n model_list: Optional[list] = None,\n mock_response: Optional[str] = None,\n force_timeout: Optional[int] = 600,\n custom_llm_provider: Optional[str] = None,\n) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the [LiteLLM async client](https://github.com/BerriAI/litellm).\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n functions: a list of functions to apply to the conversation messages. Defaults to\n `None`.\n function_call: the name of the function to call within the conversation. Defaults\n to `None`.\n temperature: the temperature to use for the generation. Defaults to `1.0`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: Up to 4 sequences where the LLM API will stop generating further tokens.\n Defaults to `None`.\n max_tokens: The maximum number of tokens in the generated completion. Defaults to\n `None`.\n presence_penalty: It is used to penalize new tokens based on their existence in the\n text so far. Defaults to `None`.\n frequency_penalty: It is used to penalize new tokens based on their frequency in the\n text so far. Defaults to `None`.\n logit_bias: Used to modify the probability of specific tokens appearing in the\n completion. Defaults to `None`.\n user: A unique identifier representing your end-user. This can help the LLM provider\n to monitor and detect abuse. Defaults to `None`.\n metadata: Pass in additional metadata to tag your completion calls - eg. prompt\n version, details, etc. Defaults to `None`.\n api_base: Base URL for the API. Defaults to `None`.\n api_version: API version. Defaults to `None`.\n api_key: API key. Defaults to `None`.\n model_list: List of api base, version, keys. Defaults to `None`.\n mock_response: If provided, return a mock completion response for testing or debugging\n purposes. Defaults to `None`.\n force_timeout: The maximum execution time in seconds for the completion request.\n Defaults to `600`.\n custom_llm_provider: Used for Non-OpenAI LLMs, Example usage for bedrock, set(iterable)\n model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". 
Defaults to\n `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n import litellm\n from litellm import token_counter\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"litellm\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"model\": self.model,\n \"messages\": input,\n \"n\": num_generations,\n \"functions\": functions,\n \"function_call\": function_call,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stream\": False,\n \"stop\": stop,\n \"max_tokens\": max_tokens,\n \"presence_penalty\": presence_penalty,\n \"frequency_penalty\": frequency_penalty,\n \"logit_bias\": logit_bias,\n \"user\": user,\n \"metadata\": metadata,\n \"api_base\": api_base,\n \"api_version\": api_version,\n \"api_key\": api_key,\n \"model_list\": model_list,\n \"mock_response\": mock_response,\n \"force_timeout\": force_timeout,\n \"custom_llm_provider\": custom_llm_provider,\n }\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n async def _call_aclient_until_n_choices() -> List[\"Choices\"]:\n choices = []\n while len(choices) < num_generations:\n completion = await self._aclient(**kwargs) # type: ignore\n if not self.structured_output:\n completion = completion.choices\n choices.extend(completion)\n return choices\n\n # litellm.drop_params is used to en/disable sending **kwargs parameters to the API if they cannot be used\n try:\n litellm.drop_params = False\n choices = await _call_aclient_until_n_choices()\n except litellm.exceptions.APIError as e:\n if \"does not support parameters\" in str(e):\n litellm.drop_params = True\n choices = await _call_aclient_until_n_choices()\n else:\n raise e\n\n generations = []\n input_tokens = [\n token_counter(model=self.model, messages=input)\n ] * num_generations\n output_tokens = []\n\n if self.structured_output:\n for choice in choices:\n generations.append(choice.model_dump_json())\n output_tokens.append(\n token_counter(\n model=self.model,\n text=orjson.dumps(choice.model_dump_json()).decode(\"utf-8\"),\n )\n )\n return prepare_output(\n generations,\n input_tokens=input_tokens,\n output_tokens=output_tokens,\n )\n\n for choice in choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using LiteLLM client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n output_tokens.append(token_counter(model=self.model, text=content))\n\n return prepare_output(\n generations, input_tokens=input_tokens, output_tokens=output_tokens\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM","title":"LlamaCppLLM ","text":" Bases: LLM llama.cpp LLM implementation running the Python bindings for the C++ code. Attributes: Name Type Description model_path RuntimeParameter[FilePath] contains the path to the GGUF quantized model, compatible with the installed version of the llama.cpp Python bindings. n_gpu_layers RuntimeParameter[int] the number of layers to use for the GPU. Defaults to -1 , meaning that the available GPU device will be used. chat_format Optional[RuntimeParameter[str]] the chat format to use for the model. 
Defaults to None , which means the Llama format will be used. n_ctx int the context size to use for the model. Defaults to 512 . n_batch int the prompt processing maximum batch size to use for the model. Defaults to 512 . seed int random seed to use for the generation. Defaults to 4294967295 . verbose RuntimeParameter[bool] whether to print verbose output. Defaults to False . structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]] a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]] additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {} . _model Optional[Llama] the Llama model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. Runtime parameters model_path : the path to the GGUF quantized model. n_gpu_layers : the number of layers to use for the GPU. Defaults to -1 . chat_format : the chat format to use for the model. Defaults to None . verbose : whether to print verbose output. Defaults to False . extra_kwargs : additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {} . References llama.cpp llama-cpp-python Examples: Generate text: from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\n\n# You can follow along this example downloading the following model running the following\n# command in the terminal, that will download the model to the `Downloads` folder:\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nllm = LlamaCppLLM(\n model_path=str(Path.home() / model_path),\n n_gpu_layers=-1, # To use the GPU if available\n n_ctx=1024, # Set the context size\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = LlamaCppLLM(\n model_path=str(Path.home() / model_path), # type: ignore\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"json\", \"schema\": Character},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/llamacpp.py class LlamaCppLLM(LLM):\n \"\"\"llama.cpp LLM implementation running the Python bindings for the C++ code.\n\n Attributes:\n model_path: contains the path to the GGUF quantized model, compatible with the\n installed version of the `llama.cpp` Python bindings.\n n_gpu_layers: the number of layers to use for the GPU. Defaults to `-1`, meaning that\n the available GPU device will be used.\n chat_format: the chat format to use for the model. Defaults to `None`, which means the\n Llama format will be used.\n n_ctx: the context size to use for the model. Defaults to `512`.\n n_batch: the prompt processing maximum batch size to use for the model. 
Defaults to `512`.\n seed: random seed to use for the generation. Defaults to `4294967295`.\n verbose: whether to print verbose output. Defaults to `False`.\n structured_output: a dictionary containing the structured output configuration or if more\n fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n `Llama` class of `llama_cpp` library. Defaults to `{}`.\n _model: the Llama model instance. This attribute is meant to be used internally and\n should not be accessed directly. It will be set in the `load` method.\n\n Runtime parameters:\n - `model_path`: the path to the GGUF quantized model.\n - `n_gpu_layers`: the number of layers to use for the GPU. Defaults to `-1`.\n - `chat_format`: the chat format to use for the model. Defaults to `None`.\n - `verbose`: whether to print verbose output. Defaults to `False`.\n - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to the\n `Llama` class of `llama_cpp` library. Defaults to `{}`.\n\n References:\n - [`llama.cpp`](https://github.com/ggerganov/llama.cpp)\n - [`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python)\n\n Examples:\n Generate text:\n\n ```python\n from pathlib import Path\n from distilabel.models.llms import LlamaCppLLM\n\n # You can follow along this example downloading the following model running the following\n # command in the terminal, that will download the model to the `Downloads` folder:\n # curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\n model_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\n llm = LlamaCppLLM(\n model_path=str(Path.home() / model_path),\n n_gpu_layers=-1, # To use the GPU if available\n n_ctx=1024, # Set the context size\n )\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pathlib import Path\n from distilabel.models.llms import LlamaCppLLM\n\n model_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = LlamaCppLLM(\n model_path=str(Path.home() / model_path), # type: ignore\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"json\", \"schema\": Character},\n )\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model_path: RuntimeParameter[FilePath] = Field(\n default=None, description=\"The path to the GGUF quantized model.\", exclude=True\n )\n n_gpu_layers: RuntimeParameter[int] = Field(\n default=-1,\n description=\"The number of layers that will be loaded in the GPU.\",\n )\n chat_format: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The chat format to use for the model. 
Defaults to `None`, which means the Llama format will be used.\",\n )\n\n n_ctx: int = 512\n n_batch: int = 512\n seed: int = 4294967295\n verbose: RuntimeParameter[bool] = Field(\n default=False,\n description=\"Whether to print verbose output from llama.cpp library.\",\n )\n extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n default_factory=dict,\n description=\"Additional dictionary of keyword arguments that will be passed to the\"\n \" `Llama` class of `llama_cpp` library. See all the supported arguments at: \"\n \"https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__\",\n )\n structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n\n _logits_processor: Optional[\"LogitsProcessorList\"] = PrivateAttr(default=None)\n _model: Optional[\"Llama\"] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the `Llama` model from the `model_path`.\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError as ie:\n raise ImportError(\n \"The `llama_cpp` package is required to use the `LlamaCppLLM` class.\"\n ) from ie\n\n self._model = Llama(\n model_path=self.model_path.as_posix(), # type: ignore\n seed=self.seed,\n n_ctx=self.n_ctx,\n n_batch=self.n_batch,\n chat_format=self.chat_format,\n n_gpu_layers=self.n_gpu_layers,\n verbose=self.verbose,\n **self.extra_kwargs,\n )\n\n if self.structured_output:\n self._logits_processor = self._prepare_structured_output(\n self.structured_output\n )\n\n # NOTE: Here because of the custom `logging` interface used, since it will create the logging name\n # out of the model name, which won't be available until the `Llama` instance is created.\n super().load()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self._model.model_path # type: ignore\n\n @validate_call\n def generate( # type: ignore\n self,\n inputs: List[FormattedInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n extra_generation_kwargs: Optional[Dict[str, Any]] = None,\n ) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for the given input using the Llama model.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n extra_generation_kwargs: dictionary with additional arguments to be passed to\n the `create_chat_completion` method. 
Reference at\n https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n structured_output = None\n batch_outputs = []\n for input in inputs:\n if isinstance(input, tuple):\n input, structured_output = input\n elif self.structured_output:\n structured_output = self.structured_output\n\n outputs = []\n output_tokens = []\n for _ in range(num_generations):\n # NOTE(plaguss): There seems to be a bug in how the logits processor\n # is used. Basically it consumes the FSM internally, and it isn't reinitialized\n # after each generation, so subsequent calls yield nothing. This is a workaround\n # until is fixed in the `llama_cpp` or `outlines` libraries.\n if structured_output:\n self._logits_processor = self._prepare_structured_output(\n structured_output\n )\n chat_completions: \"CreateChatCompletionResponse\" = (\n self._model.create_chat_completion( # type: ignore\n messages=input, # type: ignore\n max_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n presence_penalty=presence_penalty,\n temperature=temperature,\n top_p=top_p,\n logits_processor=self._logits_processor,\n **(extra_generation_kwargs or {}),\n )\n )\n outputs.append(chat_completions[\"choices\"][0][\"message\"][\"content\"])\n output_tokens.append(chat_completions[\"usage\"][\"completion_tokens\"])\n batch_outputs.append(\n prepare_output(\n outputs,\n input_tokens=[chat_completions[\"usage\"][\"prompt_tokens\"]]\n * num_generations,\n output_tokens=output_tokens,\n )\n )\n\n return batch_outputs\n\n def _prepare_structured_output(\n self, structured_output: Optional[OutlinesStructuredOutputType] = None\n ) -> Union[\"LogitsProcessorList\", None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n result = prepare_guided_output(structured_output, \"llamacpp\", self._model)\n if (schema := result.get(\"schema\")) and self.structured_output:\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.load","title":"load() ","text":"Loads the Llama model from the model_path . 
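A small, hypothetical sketch of the load-time options documented above (the path is illustrative, and extra_kwargs is simply forwarded to llama_cpp.Llama as described in the attributes): from pathlib import Path\n\nfrom distilabel.models.llms import LlamaCppLLM\n\nllm = LlamaCppLLM(\n    model_path=str(Path.home() / \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"),\n    n_gpu_layers=-1,  # offload all layers to the GPU when one is available\n    n_ctx=2048,  # context window used by llama.cpp\n    extra_kwargs={\"n_threads\": 8},  # forwarded to `llama_cpp.Llama.__init__`\n)\n\nllm.load()\n 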
Source code in src/distilabel/models/llms/llamacpp.py def load(self) -> None:\n \"\"\"Loads the `Llama` model from the `model_path`.\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError as ie:\n raise ImportError(\n \"The `llama_cpp` package is required to use the `LlamaCppLLM` class.\"\n ) from ie\n\n self._model = Llama(\n model_path=self.model_path.as_posix(), # type: ignore\n seed=self.seed,\n n_ctx=self.n_ctx,\n n_batch=self.n_batch,\n chat_format=self.chat_format,\n n_gpu_layers=self.n_gpu_layers,\n verbose=self.verbose,\n **self.extra_kwargs,\n )\n\n if self.structured_output:\n self._logits_processor = self._prepare_structured_output(\n self.structured_output\n )\n\n # NOTE: Here because of the custom `logging` interface used, since it will create the logging name\n # out of the model name, which won't be available until the `Llama` instance is created.\n super().load()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, extra_generation_kwargs=None) ","text":"Generates num_generations responses for the given input using the Llama model. Parameters: Name Type Description Default inputs List[FormattedInput] a list of inputs in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 frequency_penalty float the repetition penalty to use for the generation. Defaults to 0.0 . 0.0 presence_penalty float the presence penalty to use for the generation. Defaults to 0.0 . 0.0 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 extra_generation_kwargs Optional[Dict[str, Any]] dictionary with additional arguments to be passed to the create_chat_completion method. Reference at https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion None Returns: Type Description List[GenerateOutput] A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/llamacpp.py @validate_call\ndef generate( # type: ignore\n self,\n inputs: List[FormattedInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n extra_generation_kwargs: Optional[Dict[str, Any]] = None,\n) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for the given input using the Llama model.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n extra_generation_kwargs: dictionary with additional arguments to be passed to\n the `create_chat_completion` method. 
Reference at\n https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n structured_output = None\n batch_outputs = []\n for input in inputs:\n if isinstance(input, tuple):\n input, structured_output = input\n elif self.structured_output:\n structured_output = self.structured_output\n\n outputs = []\n output_tokens = []\n for _ in range(num_generations):\n # NOTE(plaguss): There seems to be a bug in how the logits processor\n # is used. Basically it consumes the FSM internally, and it isn't reinitialized\n # after each generation, so subsequent calls yield nothing. This is a workaround\n # until is fixed in the `llama_cpp` or `outlines` libraries.\n if structured_output:\n self._logits_processor = self._prepare_structured_output(\n structured_output\n )\n chat_completions: \"CreateChatCompletionResponse\" = (\n self._model.create_chat_completion( # type: ignore\n messages=input, # type: ignore\n max_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n presence_penalty=presence_penalty,\n temperature=temperature,\n top_p=top_p,\n logits_processor=self._logits_processor,\n **(extra_generation_kwargs or {}),\n )\n )\n outputs.append(chat_completions[\"choices\"][0][\"message\"][\"content\"])\n output_tokens.append(chat_completions[\"usage\"][\"completion_tokens\"])\n batch_outputs.append(\n prepare_output(\n outputs,\n input_tokens=[chat_completions[\"usage\"][\"prompt_tokens\"]]\n * num_generations,\n output_tokens=output_tokens,\n )\n )\n\n return batch_outputs\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output=None) ","text":"Creates the appropriate function to filter tokens to generate structured outputs. Parameters: Name Type Description Default structured_output Optional[OutlinesStructuredOutputType] the configuration dict to prepare the structured output. None Returns: Type Description Union[LogitsProcessorList, None] The callable that will be used to guide the generation of the model. Source code in src/distilabel/models/llms/llamacpp.py def _prepare_structured_output(\n self, structured_output: Optional[OutlinesStructuredOutputType] = None\n) -> Union[\"LogitsProcessorList\", None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n result = prepare_guided_output(structured_output, \"llamacpp\", self._model)\n if (schema := result.get(\"schema\")) and self.structured_output:\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM","title":"MistralLLM ","text":" Bases: AsyncLLM Mistral LLM implementation running the async API client. Attributes: Name Type Description model str the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc. endpoint str the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\". api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Mistral API. 
Defaults to None which means that the value set for the environment variable OPENAI_API_KEY will be used, or None if not set. max_retries RuntimeParameter[int] the maximum number of retries to attempt when a request fails. Defaults to 5 . timeout RuntimeParameter[int] the maximum time in seconds to wait for a response. Defaults to 120 . max_concurrent_requests RuntimeParameter[int] the maximum number of concurrent requests to send. Defaults to 64 . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . _api_key_env_var str the name of the environment variable to use for the API key. It is meant to be used internally. _aclient Optional[Mistral] the Mistral to use for the Mistral API. It is meant to be used internally. Set in the load method. Runtime parameters api_key : the API key to authenticate the requests to the Mistral API. max_retries : the maximum number of retries to attempt when a request fails. Defaults to 5 . timeout : the maximum time in seconds to wait for a response. Defaults to 120 . max_concurrent_requests : the maximum number of concurrent requests to send. Defaults to 64 . Examples: Generate text: from distilabel.models.llms import MistralLLM\n\nllm = MistralLLM(model=\"open-mixtral-8x22b\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n\n```python\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import MistralLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = MistralLLM(\n model=\"open-mixtral-8x22b\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/mistral.py class MistralLLM(AsyncLLM):\n \"\"\"Mistral LLM implementation running the async API client.\n\n Attributes:\n model: the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc.\n endpoint: the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\".\n api_key: the API key to authenticate the requests to the Mistral API. Defaults to `None` which\n means that the value set for the environment variable `OPENAI_API_KEY` will be used, or\n `None` if not set.\n max_retries: the maximum number of retries to attempt when a request fails. Defaults to `5`.\n timeout: the maximum time in seconds to wait for a response. Defaults to `120`.\n max_concurrent_requests: the maximum number of concurrent requests to send. Defaults\n to `64`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n _api_key_env_var: the name of the environment variable to use for the API key. It is meant to\n be used internally.\n _aclient: the `Mistral` to use for the Mistral API. 
It is meant to be used internally.\n Set in the `load` method.\n\n Runtime parameters:\n - `api_key`: the API key to authenticate the requests to the Mistral API.\n - `max_retries`: the maximum number of retries to attempt when a request fails.\n Defaults to `5`.\n - `timeout`: the maximum time in seconds to wait for a response. Defaults to `120`.\n - `max_concurrent_requests`: the maximum number of concurrent requests to send.\n Defaults to `64`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import MistralLLM\n\n llm = MistralLLM(model=\"open-mixtral-8x22b\")\n\n llm.load()\n\n # Call the model\n output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import MistralLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = MistralLLM(\n model=\"open-mixtral-8x22b\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n endpoint: str = \"https://api.mistral.ai\"\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_MISTRALAI_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Mistral API.\",\n )\n max_retries: RuntimeParameter[int] = Field(\n default=6,\n description=\"The maximum number of times to retry the request to the API before\"\n \" failing.\",\n )\n timeout: RuntimeParameter[int] = Field(\n default=120,\n description=\"The maximum time in seconds to wait for a response from the API.\",\n )\n max_concurrent_requests: RuntimeParameter[int] = Field(\n default=64, description=\"The maximum number of concurrent requests to send.\"\n )\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _num_generations_param_supported = False\n\n _api_key_env_var: str = PrivateAttr(_MISTRALAI_API_KEY_ENV_VAR_NAME)\n _aclient: Optional[\"Mistral\"] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the `Mistral` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from mistralai import Mistral\n except ImportError as ie:\n raise ImportError(\n \"MistralAI Python client is not installed. 
Please install it using\"\n \" `pip install mistralai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._aclient = Mistral(\n api_key=self.api_key.get_secret_value(),\n endpoint=self.endpoint,\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout, # type: ignore\n max_concurrent_requests=self.max_concurrent_requests, # type: ignore\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"mistral\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n # TODO: add `num_generations` parameter once Mistral client allows `n` parameter\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_new_tokens: Optional[int] = None,\n temperature: Optional[float] = None,\n top_p: Optional[float] = None,\n ) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the MistralAI async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"mistral\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"max_tokens\": max_new_tokens,\n \"temperature\": temperature,\n \"top_p\": top_p,\n }\n generations = []\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n # TODO:\u00a0This should work just with the _aclient.chat method, but it's not working.\n # We need to check instructor and see if we can create a PR.\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n else:\n # completion = await self._aclient.chat(**kwargs) # type: ignore\n completion = await self._aclient.chat.complete_async(**kwargs) # type: ignore\n\n if structured_output:\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using MistralAI client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n\n return prepare_output(generations, **self._get_llm_statistics(completion))\n\n @staticmethod\n def _get_llm_statistics(completion: \"ChatCompletionResponse\") -> \"LLMStatistics\":\n return {\n \"input_tokens\": 
[completion.usage.prompt_tokens],\n \"output_tokens\": [completion.usage.completion_tokens],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.load","title":"load() ","text":"Loads the Mistral client to benefit from async requests. Source code in src/distilabel/models/llms/mistral.py def load(self) -> None:\n \"\"\"Loads the `Mistral` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from mistralai import Mistral\n except ImportError as ie:\n raise ImportError(\n \"MistralAI Python client is not installed. Please install it using\"\n \" `pip install mistralai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._aclient = Mistral(\n api_key=self.api_key.get_secret_value(),\n endpoint=self.endpoint,\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout, # type: ignore\n max_concurrent_requests=self.max_concurrent_requests, # type: ignore\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"mistral\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.agenerate","title":"agenerate(input, max_new_tokens=None, temperature=None, top_p=None) async ","text":"Generates num_generations responses for the given input using the MistralAI async client. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required max_new_tokens Optional[int] the maximum number of new tokens that the model will generate. Defaults to 128 . None temperature Optional[float] the temperature to use for the generation. Defaults to 0.1 . None top_p Optional[float] the top-p value to use for the generation. Defaults to 1.0 . None Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/mistral.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n max_new_tokens: Optional[int] = None,\n temperature: Optional[float] = None,\n top_p: Optional[float] = None,\n) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the MistralAI async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. 
Defaults to `1.0`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output,\n client=self._aclient,\n framework=\"mistral\",\n )\n self._aclient = result.get(\"client\")\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"max_tokens\": max_new_tokens,\n \"temperature\": temperature,\n \"top_p\": top_p,\n }\n generations = []\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output)\n # TODO:\u00a0This should work just with the _aclient.chat method, but it's not working.\n # We need to check instructor and see if we can create a PR.\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n else:\n # completion = await self._aclient.chat(**kwargs) # type: ignore\n completion = await self._aclient.chat.complete_async(**kwargs) # type: ignore\n\n if structured_output:\n return prepare_output(\n [completion.model_dump_json()],\n **self._get_llm_statistics(completion._raw_response),\n )\n\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using MistralAI client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n\n return prepare_output(generations, **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM","title":"MixtureOfAgentsLLM ","text":" Bases: AsyncLLM Mixture-of-Agents implementation. An LLM class that leverages LLM s collective strenghts to generate a response, as described in the \"Mixture-of-Agents Enhances Large Language model Capabilities\" paper. There is a list of LLM s proposing/generating outputs that LLM s from the next round/layer can use as auxiliary information. Finally, there is an LLM that aggregates the outputs to generate the final response. Attributes: Name Type Description aggregator_llm LLM The LLM that aggregates the outputs of the proposer LLM s. proposers_llms List[AsyncLLM] The list of LLM s that propose outputs to be aggregated. rounds int The number of layers or rounds that the proposers_llms will generate outputs. Defaults to 1 . References - Mixture-of-Agents Enhances Large Language Model Capabilities
Examples: Generate text: from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\nllm = MixtureOfAgentsLLM(\n aggregator_llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n proposers_llms=[\n InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n InferenceEndpointsLLM(\n model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n ),\n InferenceEndpointsLLM(\n model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n ),\n ],\n rounds=2,\n)\n\nllm.load()\n\noutput = llm.generate_outputs(\n inputs=[\n [\n {\n \"role\": \"user\",\n \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n }\n ]\n ]\n)\n Source code in src/distilabel/models/llms/moa.py class MixtureOfAgentsLLM(AsyncLLM):\n \"\"\"`Mixture-of-Agents` implementation.\n\n An `LLM` class that leverages `LLM`s collective strenghts to generate a response,\n as described in the \"Mixture-of-Agents Enhances Large Language model Capabilities\"\n paper. There is a list of `LLM`s proposing/generating outputs that `LLM`s from the next\n round/layer can use as auxiliary information. Finally, there is an `LLM` that aggregates\n the outputs to generate the final response.\n\n Attributes:\n aggregator_llm: The `LLM` that aggregates the outputs of the proposer `LLM`s.\n proposers_llms: The list of `LLM`s that propose outputs to be aggregated.\n rounds: The number of layers or rounds that the `proposers_llms` will generate\n outputs. Defaults to `1`.\n\n References:\n - [Mixture-of-Agents Enhances Large Language Model Capabilities](https://arxiv.org/abs/2406.04692)\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\n llm = MixtureOfAgentsLLM(\n aggregator_llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n proposers_llms=[\n InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n InferenceEndpointsLLM(\n model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n ),\n InferenceEndpointsLLM(\n model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n ),\n ],\n rounds=2,\n )\n\n llm.load()\n\n output = llm.generate_outputs(\n inputs=[\n [\n {\n \"role\": \"user\",\n \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n }\n ]\n ]\n )\n ```\n \"\"\"\n\n aggregator_llm: LLM\n proposers_llms: List[AsyncLLM] = Field(default_factory=list)\n rounds: int = 1\n\n @property\n def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n \"\"\"Returns the runtime parameters of the `LLM`, which are a combination of the\n `RuntimeParameter`s of the `LLM`, the `aggregator_llm` and the `proposers_llms`.\n\n Returns:\n The runtime parameters of the `LLM`.\n \"\"\"\n runtime_parameters_names = super().runtime_parameters_names\n del runtime_parameters_names[\"generation_kwargs\"]\n return runtime_parameters_names\n\n def load(self) -> None:\n \"\"\"Loads all the `LLM`s in the `MixtureOfAgents`.\"\"\"\n 
super().load()\n\n for llm in self.proposers_llms:\n self._logger.debug(f\"Loading proposer LLM in MoA: {llm}\") # type: ignore\n llm.load()\n\n self._logger.debug(f\"Loading aggregator LLM in MoA: {self.aggregator_llm}\") # type: ignore\n self.aggregator_llm.load()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the aggregated model name.\"\"\"\n return f\"moa-{self.aggregator_llm.model_name}-{'-'.join([llm.model_name for llm in self.proposers_llms])}\"\n\n def get_generation_kwargs(self) -> Dict[str, Any]:\n \"\"\"Returns the generation kwargs of the `MixtureOfAgents` as a dictionary.\n\n Returns:\n The generation kwargs of the `MixtureOfAgents`.\n \"\"\"\n return {\n \"aggregator_llm\": self.aggregator_llm.get_generation_kwargs(),\n \"proposers_llms\": [\n llm.get_generation_kwargs() for llm in self.proposers_llms\n ],\n }\n\n # `abstractmethod`, had to be implemented but not used\n async def agenerate(\n self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n ) -> List[Union[str, None]]:\n raise NotImplementedError(\n \"`agenerate` method is not implemented for `MixtureOfAgents`\"\n )\n\n def _build_moa_system_prompt(self, prev_outputs: List[str]) -> str:\n \"\"\"Builds the Mixture-of-Agents system prompt.\n\n Args:\n prev_outputs: The list of previous outputs to use as references.\n\n Returns:\n The Mixture-of-Agents system prompt.\n \"\"\"\n moa_system_prompt = MOA_SYSTEM_PROMPT\n for i, prev_output in enumerate(prev_outputs):\n if prev_output is not None:\n moa_system_prompt += f\"\\n{i + 1}. {prev_output}\"\n return moa_system_prompt\n\n def _inject_moa_system_prompt(\n self, input: \"StandardInput\", prev_outputs: List[str]\n ) -> \"StandardInput\":\n \"\"\"Injects the Mixture-of-Agents system prompt into the input.\n\n Args:\n input: The input to inject the system prompt into.\n prev_outputs: The list of previous outputs to use as references.\n\n Returns:\n The input with the Mixture-of-Agents system prompt injected.\n \"\"\"\n if len(prev_outputs) == 0:\n return input\n\n moa_system_prompt = self._build_moa_system_prompt(prev_outputs)\n\n system = next((item for item in input if item[\"role\"] == \"system\"), None)\n if system:\n original_system_prompt = system[\"content\"]\n system[\"content\"] = f\"{moa_system_prompt}\\n\\n{original_system_prompt}\"\n else:\n input.insert(0, {\"role\": \"system\", \"content\": moa_system_prompt})\n\n return input\n\n async def _agenerate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n aggregator_llm_kwargs: Dict[str, Any] = kwargs.get(\"aggregator_llm\", {})\n proposers_llms_kwargs: List[Dict[str, Any]] = kwargs.get(\n \"proposers_llms\", [{}] * len(self.proposers_llms)\n )\n\n prev_outputs = []\n for round in range(self.rounds):\n self._logger.debug(f\"Generating round {round + 1}/{self.rounds} in MoA\") # type: ignore\n\n # Generate `num_generations` with each proposer LLM for each input\n tasks = [\n asyncio.create_task(\n llm._agenerate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in 
itertools.zip_longest(\n inputs, prev_outputs, fillvalue=[]\n )\n ],\n num_generations=1,\n **generation_kwargs,\n )\n )\n for llm, generation_kwargs in zip(\n self.proposers_llms, proposers_llms_kwargs\n )\n ]\n\n # Group generations per input\n outputs: List[List[\"GenerateOutput\"]] = await asyncio.gather(*tasks)\n prev_outputs = [\n list(itertools.chain(*input_outputs)) for input_outputs in zip(*outputs)\n ]\n\n self._logger.debug(\"Aggregating outputs in MoA\") # type: ignore\n if isinstance(self.aggregator_llm, AsyncLLM):\n return await self.aggregator_llm._agenerate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in zip(inputs, prev_outputs)\n ],\n num_generations=num_generations,\n **aggregator_llm_kwargs,\n )\n\n return self.aggregator_llm.generate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in zip(inputs, prev_outputs)\n ],\n num_generations=num_generations,\n **aggregator_llm_kwargs,\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property ","text":"Returns the runtime parameters of the LLM , which are a combination of the RuntimeParameter s of the LLM , the aggregator_llm and the proposers_llms . Returns: Type Description RuntimeParametersNames The runtime parameters of the LLM . "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.model_name","title":"model_name: str property ","text":"Returns the aggregated model name. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.load","title":"load() ","text":"Loads all the LLM s in the MixtureOfAgents . Source code in src/distilabel/models/llms/moa.py def load(self) -> None:\n \"\"\"Loads all the `LLM`s in the `MixtureOfAgents`.\"\"\"\n super().load()\n\n for llm in self.proposers_llms:\n self._logger.debug(f\"Loading proposer LLM in MoA: {llm}\") # type: ignore\n llm.load()\n\n self._logger.debug(f\"Loading aggregator LLM in MoA: {self.aggregator_llm}\") # type: ignore\n self.aggregator_llm.load()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.get_generation_kwargs","title":"get_generation_kwargs() ","text":"Returns the generation kwargs of the MixtureOfAgents as a dictionary. Returns: Type Description Dict[str, Any] The generation kwargs of the MixtureOfAgents . Source code in src/distilabel/models/llms/moa.py def get_generation_kwargs(self) -> Dict[str, Any]:\n \"\"\"Returns the generation kwargs of the `MixtureOfAgents` as a dictionary.\n\n Returns:\n The generation kwargs of the `MixtureOfAgents`.\n \"\"\"\n return {\n \"aggregator_llm\": self.aggregator_llm.get_generation_kwargs(),\n \"proposers_llms\": [\n llm.get_generation_kwargs() for llm in self.proposers_llms\n ],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._build_moa_system_prompt","title":"_build_moa_system_prompt(prev_outputs) ","text":"Builds the Mixture-of-Agents system prompt. Parameters: Name Type Description Default prev_outputs List[str] The list of previous outputs to use as references. required Returns: Type Description str The Mixture-of-Agents system prompt. 
Source code in src/distilabel/models/llms/moa.py def _build_moa_system_prompt(self, prev_outputs: List[str]) -> str:\n \"\"\"Builds the Mixture-of-Agents system prompt.\n\n Args:\n prev_outputs: The list of previous outputs to use as references.\n\n Returns:\n The Mixture-of-Agents system prompt.\n \"\"\"\n moa_system_prompt = MOA_SYSTEM_PROMPT\n for i, prev_output in enumerate(prev_outputs):\n if prev_output is not None:\n moa_system_prompt += f\"\\n{i + 1}. {prev_output}\"\n return moa_system_prompt\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._inject_moa_system_prompt","title":"_inject_moa_system_prompt(input, prev_outputs) ","text":"Injects the Mixture-of-Agents system prompt into the input. Parameters: Name Type Description Default input StandardInput The input to inject the system prompt into. required prev_outputs List[str] The list of previous outputs to use as references. required Returns: Type Description StandardInput The input with the Mixture-of-Agents system prompt injected. Source code in src/distilabel/models/llms/moa.py def _inject_moa_system_prompt(\n self, input: \"StandardInput\", prev_outputs: List[str]\n) -> \"StandardInput\":\n \"\"\"Injects the Mixture-of-Agents system prompt into the input.\n\n Args:\n input: The input to inject the system prompt into.\n prev_outputs: The list of previous outputs to use as references.\n\n Returns:\n The input with the Mixture-of-Agents system prompt injected.\n \"\"\"\n if len(prev_outputs) == 0:\n return input\n\n moa_system_prompt = self._build_moa_system_prompt(prev_outputs)\n\n system = next((item for item in input if item[\"role\"] == \"system\"), None)\n if system:\n original_system_prompt = system[\"content\"]\n system[\"content\"] = f\"{moa_system_prompt}\\n\\n{original_system_prompt}\"\n else:\n input.insert(0, {\"role\": \"system\", \"content\": moa_system_prompt})\n\n return input\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._agenerate","title":"_agenerate(inputs, num_generations=1, **kwargs) async ","text":"Internal function to concurrently generate responses for a list of inputs. Parameters: Name Type Description Default inputs List[FormattedInput] the list of inputs to generate responses for. required num_generations int the number of generations to generate per input. 1 **kwargs Any the additional kwargs to be used for the generation. {} Returns: Type Description List[GenerateOutput] A list containing the generations for each input. 
Source code in src/distilabel/models/llms/moa.py async def _agenerate(\n self,\n inputs: List[\"FormattedInput\"],\n num_generations: int = 1,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n Args:\n inputs: the list of inputs to generate responses for.\n num_generations: the number of generations to generate per input.\n **kwargs: the additional kwargs to be used for the generation.\n\n Returns:\n A list containing the generations for each input.\n \"\"\"\n aggregator_llm_kwargs: Dict[str, Any] = kwargs.get(\"aggregator_llm\", {})\n proposers_llms_kwargs: List[Dict[str, Any]] = kwargs.get(\n \"proposers_llms\", [{}] * len(self.proposers_llms)\n )\n\n prev_outputs = []\n for round in range(self.rounds):\n self._logger.debug(f\"Generating round {round + 1}/{self.rounds} in MoA\") # type: ignore\n\n # Generate `num_generations` with each proposer LLM for each input\n tasks = [\n asyncio.create_task(\n llm._agenerate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in itertools.zip_longest(\n inputs, prev_outputs, fillvalue=[]\n )\n ],\n num_generations=1,\n **generation_kwargs,\n )\n )\n for llm, generation_kwargs in zip(\n self.proposers_llms, proposers_llms_kwargs\n )\n ]\n\n # Group generations per input\n outputs: List[List[\"GenerateOutput\"]] = await asyncio.gather(*tasks)\n prev_outputs = [\n list(itertools.chain(*input_outputs)) for input_outputs in zip(*outputs)\n ]\n\n self._logger.debug(\"Aggregating outputs in MoA\") # type: ignore\n if isinstance(self.aggregator_llm, AsyncLLM):\n return await self.aggregator_llm._agenerate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in zip(inputs, prev_outputs)\n ],\n num_generations=num_generations,\n **aggregator_llm_kwargs,\n )\n\n return self.aggregator_llm.generate(\n inputs=[\n self._inject_moa_system_prompt(\n cast(\"StandardInput\", input), prev_input_outputs\n )\n for input, prev_input_outputs in zip(inputs, prev_outputs)\n ],\n num_generations=num_generations,\n **aggregator_llm_kwargs,\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM","title":"OllamaLLM ","text":" Bases: AsyncLLM Ollama LLM implementation running the Async API client. Attributes: Name Type Description model str the model name to use for the LLM e.g. \"notus\". host Optional[RuntimeParameter[str]] the Ollama server host. timeout RuntimeParameter[int] the timeout for the LLM. Defaults to 120 . _aclient Optional[AsyncClient] the AsyncClient to use for the Ollama API. It is meant to be used internally. Set in the load method. Runtime parameters host : the Ollama server host. timeout : the client timeout for the Ollama API. Defaults to 120 . Examples: Generate text: from distilabel.models.llms import OllamaLLM\n\nllm = OllamaLLM(model=\"llama3\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Source code in src/distilabel/models/llms/ollama.py class OllamaLLM(AsyncLLM):\n \"\"\"Ollama LLM implementation running the Async API client.\n\n Attributes:\n model: the model name to use for the LLM e.g. \"notus\".\n host: the Ollama server host.\n timeout: the timeout for the LLM. Defaults to `120`.\n _aclient: the `AsyncClient` to use for the Ollama API. 
It is meant to be used internally.\n Set in the `load` method.\n\n Runtime parameters:\n - `host`: the Ollama server host.\n - `timeout`: the client timeout for the Ollama API. Defaults to `120`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import OllamaLLM\n\n llm = OllamaLLM(model=\"llama3\")\n\n llm.load()\n\n # Call the model\n output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n \"\"\"\n\n model: str\n host: Optional[RuntimeParameter[str]] = Field(\n default=None, description=\"The host of the Ollama API.\"\n )\n timeout: RuntimeParameter[int] = Field(\n default=120, description=\"The timeout for the Ollama API.\"\n )\n follow_redirects: bool = True\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _num_generations_param_supported = False\n\n _aclient: Optional[\"AsyncClient\"] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the `AsyncClient` to use Ollama async API.\"\"\"\n super().load()\n\n try:\n from ollama import AsyncClient\n\n self._aclient = AsyncClient(\n host=self.host,\n timeout=self.timeout,\n follow_redirects=self.follow_redirects,\n )\n except ImportError as e:\n raise ImportError(\n \"Ollama Python client is not installed. Please install it using\"\n \" `pip install ollama`.\"\n ) from e\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: StandardInput,\n format: Literal[\"\", \"json\"] = \"\",\n # TODO: include relevant options from `Options` in `agenerate` method.\n options: Union[Options, None] = None,\n keep_alive: Union[bool, None] = None,\n ) -> GenerateOutput:\n \"\"\"\n Generates a response asynchronously, using the [Ollama Async API definition](https://github.com/ollama/ollama-python).\n\n Args:\n input: the input to use for the generation.\n format: the format to use for the generation. Defaults to `\"\"`.\n options: the options to use for the generation. Defaults to `None`.\n keep_alive: whether to keep the connection alive. Defaults to `None`.\n\n Returns:\n A list of strings as completion for the given input.\n \"\"\"\n text = None\n try:\n completion: Dict[str, Any] = await self._aclient.chat( # type: ignore\n model=self.model,\n messages=input, # type: ignore\n stream=False,\n format=format,\n options=options,\n keep_alive=keep_alive,\n )\n text = completion[\"message\"][\"content\"]\n except Exception as e:\n self._logger.warning( # type: ignore\n f\"\u26a0\ufe0f Received no response using Ollama client (model: '{self.model_name}').\"\n f\" Finish reason was: {e}\"\n )\n\n return prepare_output([text], **self._get_llm_statistics(completion))\n\n @staticmethod\n def _get_llm_statistics(completion: Dict[str, Any]) -> \"LLMStatistics\":\n return {\n \"input_tokens\": [completion[\"prompt_eval_count\"]],\n \"output_tokens\": [completion[\"eval_count\"]],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.load","title":"load() ","text":"Loads the AsyncClient to use Ollama async API. 
Source code in src/distilabel/models/llms/ollama.py def load(self) -> None:\n \"\"\"Loads the `AsyncClient` to use Ollama async API.\"\"\"\n super().load()\n\n try:\n from ollama import AsyncClient\n\n self._aclient = AsyncClient(\n host=self.host,\n timeout=self.timeout,\n follow_redirects=self.follow_redirects,\n )\n except ImportError as e:\n raise ImportError(\n \"Ollama Python client is not installed. Please install it using\"\n \" `pip install ollama`.\"\n ) from e\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.agenerate","title":"agenerate(input, format='', options=None, keep_alive=None) async ","text":"Generates a response asynchronously, using the Ollama Async API definition. Parameters: Name Type Description Default input StandardInput the input to use for the generation. required format Literal['', 'json'] the format to use for the generation. Defaults to \"\" . '' options Union[Options, None] the options to use for the generation. Defaults to None . None keep_alive Union[bool, None] whether to keep the connection alive. Defaults to None . None Returns: Type Description GenerateOutput A list of strings as completion for the given input. Source code in src/distilabel/models/llms/ollama.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: StandardInput,\n format: Literal[\"\", \"json\"] = \"\",\n # TODO: include relevant options from `Options` in `agenerate` method.\n options: Union[Options, None] = None,\n keep_alive: Union[bool, None] = None,\n) -> GenerateOutput:\n \"\"\"\n Generates a response asynchronously, using the [Ollama Async API definition](https://github.com/ollama/ollama-python).\n\n Args:\n input: the input to use for the generation.\n format: the format to use for the generation. Defaults to `\"\"`.\n options: the options to use for the generation. Defaults to `None`.\n keep_alive: whether to keep the connection alive. Defaults to `None`.\n\n Returns:\n A list of strings as completion for the given input.\n \"\"\"\n text = None\n try:\n completion: Dict[str, Any] = await self._aclient.chat( # type: ignore\n model=self.model,\n messages=input, # type: ignore\n stream=False,\n format=format,\n options=options,\n keep_alive=keep_alive,\n )\n text = completion[\"message\"][\"content\"]\n except Exception as e:\n self._logger.warning( # type: ignore\n f\"\u26a0\ufe0f Received no response using Ollama client (model: '{self.model_name}').\"\n f\" Finish reason was: {e}\"\n )\n\n return prepare_output([text], **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM","title":"OpenAILLM ","text":" Bases: AsyncLLM OpenAI LLM implementation running the async API client. Attributes: Name Type Description model str the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc. Supported models can be found here. base_url Optional[RuntimeParameter[str]] the base URL to use for the OpenAI API requests. Defaults to None , which means that the value set for the environment variable OPENAI_BASE_URL will be used, or \"https://api.openai.com/v1\" if not set. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_KEY will be used, or None if not set. max_retries RuntimeParameter[int] the maximum number of times to retry the request to the API before failing. Defaults to 6 . 
timeout RuntimeParameter[int] the maximum time in seconds to wait for a response from the API. Defaults to 120 . structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]] a dictionary containing the structured output configuration configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . Runtime parameters base_url : the base URL to use for the OpenAI API requests. Defaults to None . api_key : the API key to authenticate the requests to the OpenAI API. Defaults to None . max_retries : the maximum number of times to retry the request to the API before failing. Defaults to 6 . timeout : the maximum time in seconds to wait for a response from the API. Defaults to 120 . Icon :simple-openai: Examples: Generate text: from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate text from a custom endpoint following the OpenAI API: from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pydantic import BaseModel\nfrom distilabel.models.llms import OpenAILLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = OpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Generate with Batch API (offline batch generation): from distilabel.models.llms import OpenAILLM\n\nload = llm = OpenAILLM(\n model=\"gpt-3.5-turbo\",\n use_offline_batch_generation=True,\n offline_batch_generation_block_until_done=5, # poll for results every 5 seconds\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n# [['Hello! How can I assist you today?']]\n Source code in src/distilabel/models/llms/openai.py class OpenAILLM(AsyncLLM):\n \"\"\"OpenAI LLM implementation running the async API client.\n\n Attributes:\n model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc.\n Supported models can be found [here](https://platform.openai.com/docs/guides/text-generation).\n base_url: the base URL to use for the OpenAI API requests. Defaults to `None`, which\n means that the value set for the environment variable `OPENAI_BASE_URL` will\n be used, or \"https://api.openai.com/v1\" if not set.\n api_key: the API key to authenticate the requests to the OpenAI API. Defaults to\n `None` which means that the value set for the environment variable `OPENAI_API_KEY`\n will be used, or `None` if not set.\n max_retries: the maximum number of times to retry the request to the API before\n failing. Defaults to `6`.\n timeout: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n structured_output: a dictionary containing the structured output configuration configuration\n using `instructor`. 
You can take a look at the dictionary structure in\n `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n\n Runtime parameters:\n - `base_url`: the base URL to use for the OpenAI API requests. Defaults to `None`.\n - `api_key`: the API key to authenticate the requests to the OpenAI API. Defaults\n to `None`.\n - `max_retries`: the maximum number of times to retry the request to the API before\n failing. Defaults to `6`.\n - `timeout`: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n\n Icon:\n `:simple-openai:`\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import OpenAILLM\n\n llm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate text from a custom endpoint following the OpenAI API:\n\n ```python\n from distilabel.models.llms import OpenAILLM\n\n llm = OpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pydantic import BaseModel\n from distilabel.models.llms import OpenAILLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = OpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n\n Generate with Batch API (offline batch generation):\n\n ```python\n from distilabel.models.llms import OpenAILLM\n\n load = llm = OpenAILLM(\n model=\"gpt-3.5-turbo\",\n use_offline_batch_generation=True,\n offline_batch_generation_block_until_done=5, # poll for results every 5 seconds\n )\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n # [['Hello! How can I assist you today?']]\n ```\n \"\"\"\n\n model: str\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n \"OPENAI_BASE_URL\", \"https://api.openai.com/v1\"\n ),\n description=\"The base URL to use for the OpenAI API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_OPENAI_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the OpenAI API.\",\n )\n max_retries: RuntimeParameter[int] = Field(\n default=6,\n description=\"The maximum number of times to retry the request to the API before\"\n \" failing.\",\n )\n timeout: RuntimeParameter[int] = Field(\n default=120,\n description=\"The maximum time in seconds to wait for a response from the API.\",\n )\n structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n )\n\n _api_key_env_var: str = PrivateAttr(_OPENAI_API_KEY_ENV_VAR_NAME)\n _client: \"OpenAI\" = PrivateAttr(None)\n _aclient: \"AsyncOpenAI\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the `AsyncOpenAI` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from openai import AsyncOpenAI, OpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. 
Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._client = OpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n self._aclient = AsyncOpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"openai\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n\n def unload(self) -> None:\n \"\"\"Set clients to `None` as they both contain `thread._RLock` which cannot be pickled\n in case an exception is raised and has to be handled in the main process\"\"\"\n\n self._client = None # type: ignore\n self._aclient = None # type: ignore\n self.structured_output = None\n super().unload()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n logprobs: bool = False,\n top_logprobs: Optional[PositiveInt] = None,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[Union[str, List[str]]] = None,\n response_format: Optional[Dict[str, str]] = None,\n ) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the OpenAI async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n logprobs: whether to return the log probabilities or not. Defaults to `False`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. Defaults to `None`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: a string or a list of strings to use as a stop sequence for the generation.\n Defaults to `None`.\n response_format: the format of the response to return. Must be one of\n \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n for more information on how to use the JSON model from OpenAI. Defaults to None\n which returns text. 
To return JSON, use {\"type\": \"json_object\"}.\n\n Note:\n If response_format\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output, # type: ignore\n client=self._aclient,\n framework=\"openai\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"logprobs\": logprobs,\n \"top_logprobs\": top_logprobs,\n \"max_tokens\": max_new_tokens,\n \"n\": num_generations,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stop\": stop,\n }\n # Check if it's a vision generation task, in that case \"stop\" cannot be used or raises\n # an error in the API.\n if isinstance(\n [row for row in input if row[\"role\"] == \"user\"][0][\"content\"], list\n ):\n kwargs.pop(\"stop\")\n\n if response_format is not None:\n kwargs[\"response_format\"] = response_format\n\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore\n\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n\n if structured_output:\n # NOTE: `instructor` doesn't work with `n` parameter, so it will always return\n # only 1 choice.\n statistics = self._get_llm_statistics(completion._raw_response)\n if choice_logprobs := self._get_logprobs_from_choice(\n completion._raw_response.choices[0]\n ):\n output_logprobs = [choice_logprobs]\n else:\n output_logprobs = None\n return prepare_output(\n generations=[completion.model_dump_json()],\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=output_logprobs,\n )\n\n return self._generations_from_openai_completion(completion)\n\n def _generations_from_openai_completion(\n self, completion: \"OpenAIChatCompletion\"\n ) -> \"GenerateOutput\":\n \"\"\"Get the generations from the OpenAI Chat Completion object.\n\n Args:\n completion: the completion object to get the generations from.\n\n Returns:\n A list of strings containing the generated responses for the input.\n \"\"\"\n generations = []\n logprobs = []\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using OpenAI client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n if choice_logprobs := self._get_logprobs_from_choice(choice):\n logprobs.append(choice_logprobs)\n\n statistics = self._get_llm_statistics(completion)\n return prepare_output(\n generations=generations,\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=logprobs,\n )\n\n def _get_logprobs_from_choice(\n self, choice: \"OpenAIChoice\"\n ) -> Union[List[List[\"Logprob\"]], None]:\n if choice.logprobs is None or choice.logprobs.content is None:\n return None\n\n return [\n [\n {\"token\": top_logprob.token, \"logprob\": top_logprob.logprob}\n for top_logprob in token_logprobs.top_logprobs\n ]\n for token_logprobs in choice.logprobs.content\n ]\n\n def offline_batch_generate(\n self,\n inputs: Union[List[\"FormattedInput\"], None] = 
None,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n logprobs: bool = False,\n top_logprobs: Optional[PositiveInt] = None,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[Union[str, List[str]]] = None,\n response_format: Optional[str] = None,\n **kwargs: Any,\n ) -> List[\"GenerateOutput\"]:\n \"\"\"Uses the OpenAI batch API to generate `num_generations` responses for the given\n inputs.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n logprobs: whether to return the log probabilities or not. Defaults to `False`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. Defaults to `None`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: a string or a list of strings to use as a stop sequence for the generation.\n Defaults to `None`.\n response_format: the format of the response to return. Must be one of\n \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n for more information on how to use the JSON model from OpenAI. Defaults to `text`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input\n in `inputs`.\n\n Raises:\n DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n is not finished yet.\n ValueError: if no job IDs were found to retrieve the results from.\n \"\"\"\n if self.jobs_ids:\n return self._check_and_get_batch_results()\n\n if inputs:\n self.jobs_ids = self._create_jobs(\n inputs=inputs,\n **{\n \"model\": self.model,\n \"logprobs\": logprobs,\n \"top_logprobs\": top_logprobs,\n \"max_tokens\": max_new_tokens,\n \"n\": num_generations,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stop\": stop,\n \"response_format\": response_format,\n },\n )\n raise DistilabelOfflineBatchGenerationNotFinishedException(\n jobs_ids=self.jobs_ids\n )\n\n raise ValueError(\"No `inputs` were provided and no `jobs_ids` were found.\")\n\n def _check_and_get_batch_results(self) -> List[\"GenerateOutput\"]:\n \"\"\"Checks the status of the batch jobs and retrieves the results from the OpenAI\n Batch API.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n\n Raises:\n ValueError: if no job IDs were found to retrieve the results from.\n DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n is not finished yet.\n RuntimeError: if the only batch job found failed.\n \"\"\"\n if not self.jobs_ids:\n raise ValueError(\"No job IDs were found to retrieve the results from.\")\n\n outputs = []\n for batch_id in self.jobs_ids:\n batch = self._get_openai_batch(batch_id)\n\n if batch.status in (\"validating\", \"in_progress\", \"finalizing\"):\n raise DistilabelOfflineBatchGenerationNotFinishedException(\n jobs_ids=self.jobs_ids\n )\n\n if batch.status in (\"failed\", \"expired\", \"cancelled\", 
\"cancelling\"):\n self._logger.error( # type: ignore\n f\"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'.\"\n )\n if len(self.jobs_ids) == 1:\n self.jobs_ids = None\n raise RuntimeError(\n f\"The only OpenAI API Batch that was created with ID '{batch_id}'\"\n f\" failed with status '{batch.status}'.\"\n )\n\n continue\n\n outputs.extend(self._retrieve_batch_results(batch))\n\n # sort by `custom_id` to return the results in the same order as the inputs\n outputs = sorted(outputs, key=lambda x: int(x[\"custom_id\"]))\n return [self._parse_output(output) for output in outputs]\n\n def _parse_output(self, output: Dict[str, Any]) -> \"GenerateOutput\":\n \"\"\"Parses the output from the OpenAI Batch API into a list of strings.\n\n Args:\n output: the output to parse.\n\n Returns:\n A list of strings containing the generated responses for the input.\n \"\"\"\n from openai.types.chat import ChatCompletion as OpenAIChatCompletion\n\n if \"response\" not in output:\n return []\n\n if output[\"response\"][\"status_code\"] != 200:\n return []\n\n return self._generations_from_openai_completion(\n OpenAIChatCompletion(**output[\"response\"][\"body\"])\n )\n\n def _get_openai_batch(self, batch_id: str) -> \"OpenAIBatch\":\n \"\"\"Gets a batch from the OpenAI Batch API.\n\n Args:\n batch_id: the ID of the batch to retrieve.\n\n Returns:\n The batch retrieved from the OpenAI Batch API.\n\n Raises:\n openai.OpenAIError: if there was an error while retrieving the batch from the\n OpenAI Batch API.\n \"\"\"\n import openai\n\n try:\n return self._client.batches.retrieve(batch_id)\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while retrieving batch '{batch_id}' from OpenAI: {e}\"\n )\n raise e\n\n def _retrieve_batch_results(self, batch: \"OpenAIBatch\") -> List[Dict[str, Any]]:\n \"\"\"Retrieves the results of a batch from its output file, parsing the JSONL content\n into a list of dictionaries.\n\n Args:\n batch: the batch to retrieve the results from.\n\n Returns:\n A list of dictionaries containing the results of the batch.\n\n Raises:\n AssertionError: if no output file ID was found in the batch.\n \"\"\"\n import openai\n\n assert batch.output_file_id, \"No output file ID was found in the batch.\"\n\n try:\n file_response = self._client.files.content(batch.output_file_id)\n return [orjson.loads(line) for line in file_response.text.splitlines()]\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while retrieving batch results from file '{batch.output_file_id}': {e}\"\n )\n return []\n\n def _create_jobs(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n ) -> Tuple[str, ...]:\n \"\"\"Creates jobs in the OpenAI Batch API to generate responses for the given inputs.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n A list of job IDs created in the OpenAI Batch API.\n \"\"\"\n batch_input_files = self._create_batch_files(inputs=inputs, **kwargs)\n jobs = []\n for batch_input_file in batch_input_files:\n if batch := self._create_batch_api_job(batch_input_file):\n jobs.append(batch.id)\n return tuple(jobs)\n\n def _create_batch_api_job(\n self, batch_input_file: \"OpenAIFileObject\"\n ) -> Union[\"OpenAIBatch\", None]:\n \"\"\"Creates a job in the OpenAI Batch API to generate responses for the given input\n file.\n\n Args:\n batch_input_file: the input file to generate responses for.\n\n Returns:\n The batch 
job created in the OpenAI Batch API.\n \"\"\"\n import openai\n\n metadata = {\"description\": \"distilabel\"}\n\n if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME:\n metadata[\"distilabel_pipeline_name\"] = distilabel_pipeline_name\n\n if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID:\n metadata[\"distilabel_pipeline_cache_id\"] = distilabel_pipeline_cache_id\n\n batch = None\n try:\n batch = self._client.batches.create(\n completion_window=\"24h\",\n endpoint=\"/v1/chat/completions\",\n input_file_id=batch_input_file.id,\n metadata=metadata,\n )\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while creating OpenAI Batch API job for file with ID\"\n f\" '{batch_input_file.id}': {e}.\"\n )\n raise e\n return batch\n\n def _create_batch_files(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n ) -> List[\"OpenAIFileObject\"]:\n \"\"\"Creates the necessary input files for the batch API to generate responses. The\n maximum size of each file so the OpenAI Batch API can process it is 100MB, so we\n need to split the inputs into multiple files if necessary.\n\n More information: https://platform.openai.com/docs/api-reference/files/create\n\n Args:\n inputs: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n The list of file objects created for the OpenAI Batch API.\n\n Raises:\n openai.OpenAIError: if there was an error while creating the batch input file\n in the OpenAI Batch API.\n \"\"\"\n import openai\n\n files = []\n for file_no, buffer in enumerate(\n self._create_jsonl_buffers(inputs=inputs, **kwargs)\n ):\n try:\n # TODO: add distilabel pipeline name and id\n batch_input_file = self._client.files.create(\n file=(self._name_for_openai_files(file_no), buffer),\n purpose=\"batch\",\n )\n files.append(batch_input_file)\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while creating OpenAI batch input file: {e}\"\n )\n raise e\n return files\n\n def _create_jsonl_buffers(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n ) -> Generator[io.BytesIO, None, None]:\n \"\"\"Creates a generator of buffers containing the JSONL formatted inputs to be\n used by the OpenAI Batch API. 
The buffers created are of size 100MB or less.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n kwargs: the keyword arguments to use for the generation.\n\n Yields:\n A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch\n API.\n \"\"\"\n buffer = io.BytesIO()\n buffer_current_size = 0\n for i, input in enumerate(inputs):\n # We create the smallest `custom_id` so we don't increase the size of the file\n # to much, but we can still sort the results with the order of the inputs.\n row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs)\n row_size = len(row)\n if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE:\n buffer.seek(0)\n yield buffer\n buffer = io.BytesIO()\n buffer_current_size = 0\n buffer.write(row)\n buffer_current_size += row_size\n\n if buffer_current_size > 0:\n buffer.seek(0)\n yield buffer\n\n def _create_jsonl_row(\n self, input: \"FormattedInput\", custom_id: str, **kwargs: Any\n ) -> bytes:\n \"\"\"Creates a JSONL formatted row to be used by the OpenAI Batch API.\n\n Args:\n input: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n custom_id: a custom ID to use for the row.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n A JSONL formatted row to be used by the OpenAI Batch API.\n \"\"\"\n # TODO: depending on the format of the input, add `response_format` to the kwargs\n row = {\n \"custom_id\": custom_id,\n \"method\": \"POST\",\n \"url\": \"/v1/chat/completions\",\n \"body\": {\"messages\": input, **kwargs},\n }\n json_row = orjson.dumps(row)\n return json_row + b\"\\n\"\n\n def _name_for_openai_files(self, file_no: int) -> str:\n if (\n envs.DISTILABEL_PIPELINE_NAME is None\n or envs.DISTILABEL_PIPELINE_CACHE_ID is None\n ):\n return f\"distilabel-pipeline-fileno-{file_no}.jsonl\"\n\n return f\"distilabel-pipeline-{envs.DISTILABEL_PIPELINE_NAME}-{envs.DISTILABEL_PIPELINE_CACHE_ID}-fileno-{file_no}.jsonl\"\n\n @staticmethod\n def _get_llm_statistics(\n completion: Union[\"OpenAIChatCompletion\", \"OpenAICompletion\"],\n ) -> \"LLMStatistics\":\n return {\n \"output_tokens\": [\n completion.usage.completion_tokens if completion.usage else 0\n ],\n \"input_tokens\": [completion.usage.prompt_tokens if completion.usage else 0],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.load","title":"load() ","text":"Loads the AsyncOpenAI client to benefit from async requests. Source code in src/distilabel/models/llms/openai.py def load(self) -> None:\n \"\"\"Loads the `AsyncOpenAI` client to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from openai import AsyncOpenAI, OpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. 
Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n if self.api_key is None:\n raise ValueError(\n f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n )\n\n self._client = OpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n self._aclient = AsyncOpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(),\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n if self.structured_output:\n result = self._prepare_structured_output(\n structured_output=self.structured_output,\n client=self._aclient,\n framework=\"openai\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n if structured_output := result.get(\"structured_output\"):\n self.structured_output = structured_output\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.unload","title":"unload() ","text":"Set clients to None as they both contain thread._RLock which cannot be pickled in case an exception is raised and has to be handled in the main process Source code in src/distilabel/models/llms/openai.py def unload(self) -> None:\n \"\"\"Set clients to `None` as they both contain `thread._RLock` which cannot be pickled\n in case an exception is raised and has to be handled in the main process\"\"\"\n\n self._client = None # type: ignore\n self._aclient = None # type: ignore\n self.structured_output = None\n super().unload()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.agenerate","title":"agenerate(input, num_generations=1, max_new_tokens=128, logprobs=False, top_logprobs=None, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, stop=None, response_format=None) async ","text":"Generates num_generations responses for the given input using the OpenAI async client. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 logprobs bool whether to return the log probabilities or not. Defaults to False . False top_logprobs Optional[PositiveInt] the number of top log probabilities to return per output token generated. Defaults to None . None frequency_penalty float the repetition penalty to use for the generation. Defaults to 0.0 . 0.0 presence_penalty float the presence penalty to use for the generation. Defaults to 0.0 . 0.0 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 stop Optional[Union[str, List[str]]] a string or a list of strings to use as a stop sequence for the generation. Defaults to None . None response_format Optional[Dict[str, str]] the format of the response to return. Must be one of \"text\" or \"json\". Read the documentation here for more information on how to use the JSON model from OpenAI. Defaults to None which returns text. To return JSON, use {\"type\": \"json_object\"}. None Note If response_format Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. 
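Example (an illustrative sketch, not part of the distilabel docs; \"gpt-4o-mini\" is a placeholder model name, and it is assumed that generate_outputs forwards these keyword arguments to agenerate as in the other examples on this page): from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4o-mini\")\nllm.load()\n\n# Request JSON output using the OpenAI JSON mode described above. The prompt itself\n# should mention JSON, otherwise the OpenAI API may reject the request.\noutput = llm.generate_outputs(\n    inputs=[[{\"role\": \"user\", \"content\": \"Return a JSON object with a 'greeting' key.\"}]],\n    temperature=0.7,\n    max_new_tokens=128,\n    response_format={\"type\": \"json_object\"},\n)\n 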
Source code in src/distilabel/models/llms/openai.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n logprobs: bool = False,\n top_logprobs: Optional[PositiveInt] = None,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[Union[str, List[str]]] = None,\n response_format: Optional[Dict[str, str]] = None,\n) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the OpenAI async\n client.\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n logprobs: whether to return the log probabilities or not. Defaults to `False`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. Defaults to `None`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: a string or a list of strings to use as a stop sequence for the generation.\n Defaults to `None`.\n response_format: the format of the response to return. Must be one of\n \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n for more information on how to use the JSON model from OpenAI. Defaults to None\n which returns text. 
To return JSON, use {\"type\": \"json_object\"}.\n\n Note:\n If response_format\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n\n structured_output = None\n if isinstance(input, tuple):\n input, structured_output = input\n result = self._prepare_structured_output(\n structured_output=structured_output, # type: ignore\n client=self._aclient,\n framework=\"openai\",\n )\n self._aclient = result.get(\"client\") # type: ignore\n\n if structured_output is None and self.structured_output is not None:\n structured_output = self.structured_output\n\n kwargs = {\n \"messages\": input, # type: ignore\n \"model\": self.model,\n \"logprobs\": logprobs,\n \"top_logprobs\": top_logprobs,\n \"max_tokens\": max_new_tokens,\n \"n\": num_generations,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stop\": stop,\n }\n # Check if it's a vision generation task, in that case \"stop\" cannot be used or raises\n # an error in the API.\n if isinstance(\n [row for row in input if row[\"role\"] == \"user\"][0][\"content\"], list\n ):\n kwargs.pop(\"stop\")\n\n if response_format is not None:\n kwargs[\"response_format\"] = response_format\n\n if structured_output:\n kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore\n\n completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore\n\n if structured_output:\n # NOTE: `instructor` doesn't work with `n` parameter, so it will always return\n # only 1 choice.\n statistics = self._get_llm_statistics(completion._raw_response)\n if choice_logprobs := self._get_logprobs_from_choice(\n completion._raw_response.choices[0]\n ):\n output_logprobs = [choice_logprobs]\n else:\n output_logprobs = None\n return prepare_output(\n generations=[completion.model_dump_json()],\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=output_logprobs,\n )\n\n return self._generations_from_openai_completion(completion)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._generations_from_openai_completion","title":"_generations_from_openai_completion(completion) ","text":"Get the generations from the OpenAI Chat Completion object. Parameters: Name Type Description Default completion ChatCompletion the completion object to get the generations from. required Returns: Type Description GenerateOutput A list of strings containing the generated responses for the input. 
Source code in src/distilabel/models/llms/openai.py def _generations_from_openai_completion(\n self, completion: \"OpenAIChatCompletion\"\n) -> \"GenerateOutput\":\n \"\"\"Get the generations from the OpenAI Chat Completion object.\n\n Args:\n completion: the completion object to get the generations from.\n\n Returns:\n A list of strings containing the generated responses for the input.\n \"\"\"\n generations = []\n logprobs = []\n for choice in completion.choices:\n if (content := choice.message.content) is None:\n self._logger.warning( # type: ignore\n f\"Received no response using OpenAI client (model: '{self.model}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(content)\n if choice_logprobs := self._get_logprobs_from_choice(choice):\n logprobs.append(choice_logprobs)\n\n statistics = self._get_llm_statistics(completion)\n return prepare_output(\n generations=generations,\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=logprobs,\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.offline_batch_generate","title":"offline_batch_generate(inputs=None, num_generations=1, max_new_tokens=128, logprobs=False, top_logprobs=None, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, stop=None, response_format=None, **kwargs) ","text":"Uses the OpenAI batch API to generate num_generations responses for the given inputs. Parameters: Name Type Description Default inputs Union[List[FormattedInput], None] a list of inputs in chat format to generate responses for. None num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 logprobs bool whether to return the log probabilities or not. Defaults to False . False top_logprobs Optional[PositiveInt] the number of top log probabilities to return per output token generated. Defaults to None . None frequency_penalty float the repetition penalty to use for the generation. Defaults to 0.0 . 0.0 presence_penalty float the presence penalty to use for the generation. Defaults to 0.0 . 0.0 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 stop Optional[Union[str, List[str]]] a string or a list of strings to use as a stop sequence for the generation. Defaults to None . None response_format Optional[str] the format of the response to return. Must be one of \"text\" or \"json\". Read the documentation here for more information on how to use the JSON model from OpenAI. Defaults to text . None Returns: Type Description List[GenerateOutput] A list of lists of strings containing the generated responses for each input List[GenerateOutput] in inputs . Raises: Type Description DistilabelOfflineBatchGenerationNotFinishedException if the batch generation is not finished yet. ValueError if no job IDs were found to retrieve the results from. 
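Example (an illustrative sketch of how this method is typically driven outside of a Pipeline; the import path of the exception and the \"gpt-4o-mini\" model name are assumptions): import time\n\nfrom distilabel.exceptions import DistilabelOfflineBatchGenerationNotFinishedException  # assumed import path\nfrom distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4o-mini\")\nllm.load()\n\ninputs = [[{\"role\": \"user\", \"content\": \"Hello world!\"}]]\nwhile True:\n    try:\n        # The first call uploads the batch input files, creates the jobs and raises;\n        # subsequent calls poll the created jobs until all of them have finished.\n        outputs = llm.offline_batch_generate(inputs=inputs)\n        break\n    except DistilabelOfflineBatchGenerationNotFinishedException:\n        time.sleep(60)\n 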
Source code in src/distilabel/models/llms/openai.py def offline_batch_generate(\n self,\n inputs: Union[List[\"FormattedInput\"], None] = None,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n logprobs: bool = False,\n top_logprobs: Optional[PositiveInt] = None,\n frequency_penalty: float = 0.0,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n stop: Optional[Union[str, List[str]]] = None,\n response_format: Optional[str] = None,\n **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n \"\"\"Uses the OpenAI batch API to generate `num_generations` responses for the given\n inputs.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n logprobs: whether to return the log probabilities or not. Defaults to `False`.\n top_logprobs: the number of top log probabilities to return per output token\n generated. Defaults to `None`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n stop: a string or a list of strings to use as a stop sequence for the generation.\n Defaults to `None`.\n response_format: the format of the response to return. Must be one of\n \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n for more information on how to use the JSON model from OpenAI. Defaults to `text`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input\n in `inputs`.\n\n Raises:\n DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n is not finished yet.\n ValueError: if no job IDs were found to retrieve the results from.\n \"\"\"\n if self.jobs_ids:\n return self._check_and_get_batch_results()\n\n if inputs:\n self.jobs_ids = self._create_jobs(\n inputs=inputs,\n **{\n \"model\": self.model,\n \"logprobs\": logprobs,\n \"top_logprobs\": top_logprobs,\n \"max_tokens\": max_new_tokens,\n \"n\": num_generations,\n \"frequency_penalty\": frequency_penalty,\n \"presence_penalty\": presence_penalty,\n \"temperature\": temperature,\n \"top_p\": top_p,\n \"stop\": stop,\n \"response_format\": response_format,\n },\n )\n raise DistilabelOfflineBatchGenerationNotFinishedException(\n jobs_ids=self.jobs_ids\n )\n\n raise ValueError(\"No `inputs` were provided and no `jobs_ids` were found.\")\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._check_and_get_batch_results","title":"_check_and_get_batch_results() ","text":"Checks the status of the batch jobs and retrieves the results from the OpenAI Batch API. Returns: Type Description List[GenerateOutput] A list of lists of strings containing the generated responses for each input. Raises: Type Description ValueError if no job IDs were found to retrieve the results from. DistilabelOfflineBatchGenerationNotFinishedException if the batch generation is not finished yet. RuntimeError if the only batch job found failed. 
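Continuing the sketch above (same assumed imports), and further assuming that jobs_ids can be assigned directly on a fresh instance, the saved job IDs can be restored so that this check runs in a later process; the \"batch_abc123\" ID is hypothetical: llm = OpenAILLM(model=\"gpt-4o-mini\")\nllm.load()\nllm.jobs_ids = (\"batch_abc123\",)  # hypothetical job ID saved from a previous run\ntry:\n    # With `jobs_ids` set, `offline_batch_generate` only checks and retrieves the results.\n    outputs = llm.offline_batch_generate()\nexcept DistilabelOfflineBatchGenerationNotFinishedException:\n    pass  # jobs still validating/in progress/finalizing, try again later\n 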
Source code in src/distilabel/models/llms/openai.py def _check_and_get_batch_results(self) -> List[\"GenerateOutput\"]:\n \"\"\"Checks the status of the batch jobs and retrieves the results from the OpenAI\n Batch API.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n\n Raises:\n ValueError: if no job IDs were found to retrieve the results from.\n DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n is not finished yet.\n RuntimeError: if the only batch job found failed.\n \"\"\"\n if not self.jobs_ids:\n raise ValueError(\"No job IDs were found to retrieve the results from.\")\n\n outputs = []\n for batch_id in self.jobs_ids:\n batch = self._get_openai_batch(batch_id)\n\n if batch.status in (\"validating\", \"in_progress\", \"finalizing\"):\n raise DistilabelOfflineBatchGenerationNotFinishedException(\n jobs_ids=self.jobs_ids\n )\n\n if batch.status in (\"failed\", \"expired\", \"cancelled\", \"cancelling\"):\n self._logger.error( # type: ignore\n f\"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'.\"\n )\n if len(self.jobs_ids) == 1:\n self.jobs_ids = None\n raise RuntimeError(\n f\"The only OpenAI API Batch that was created with ID '{batch_id}'\"\n f\" failed with status '{batch.status}'.\"\n )\n\n continue\n\n outputs.extend(self._retrieve_batch_results(batch))\n\n # sort by `custom_id` to return the results in the same order as the inputs\n outputs = sorted(outputs, key=lambda x: int(x[\"custom_id\"]))\n return [self._parse_output(output) for output in outputs]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._parse_output","title":"_parse_output(output) ","text":"Parses the output from the OpenAI Batch API into a list of strings. Parameters: Name Type Description Default output Dict[str, Any] the output to parse. required Returns: Type Description GenerateOutput A list of strings containing the generated responses for the input. Source code in src/distilabel/models/llms/openai.py def _parse_output(self, output: Dict[str, Any]) -> \"GenerateOutput\":\n \"\"\"Parses the output from the OpenAI Batch API into a list of strings.\n\n Args:\n output: the output to parse.\n\n Returns:\n A list of strings containing the generated responses for the input.\n \"\"\"\n from openai.types.chat import ChatCompletion as OpenAIChatCompletion\n\n if \"response\" not in output:\n return []\n\n if output[\"response\"][\"status_code\"] != 200:\n return []\n\n return self._generations_from_openai_completion(\n OpenAIChatCompletion(**output[\"response\"][\"body\"])\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._get_openai_batch","title":"_get_openai_batch(batch_id) ","text":"Gets a batch from the OpenAI Batch API. Parameters: Name Type Description Default batch_id str the ID of the batch to retrieve. required Returns: Type Description Batch The batch retrieved from the OpenAI Batch API. Raises: Type Description OpenAIError if there was an error while retrieving the batch from the OpenAI Batch API. 
Source code in src/distilabel/models/llms/openai.py def _get_openai_batch(self, batch_id: str) -> \"OpenAIBatch\":\n \"\"\"Gets a batch from the OpenAI Batch API.\n\n Args:\n batch_id: the ID of the batch to retrieve.\n\n Returns:\n The batch retrieved from the OpenAI Batch API.\n\n Raises:\n openai.OpenAIError: if there was an error while retrieving the batch from the\n OpenAI Batch API.\n \"\"\"\n import openai\n\n try:\n return self._client.batches.retrieve(batch_id)\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while retrieving batch '{batch_id}' from OpenAI: {e}\"\n )\n raise e\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._retrieve_batch_results","title":"_retrieve_batch_results(batch) ","text":"Retrieves the results of a batch from its output file, parsing the JSONL content into a list of dictionaries. Parameters: Name Type Description Default batch Batch the batch to retrieve the results from. required Returns: Type Description List[Dict[str, Any]] A list of dictionaries containing the results of the batch. Raises: Type Description AssertionError if no output file ID was found in the batch. Source code in src/distilabel/models/llms/openai.py def _retrieve_batch_results(self, batch: \"OpenAIBatch\") -> List[Dict[str, Any]]:\n \"\"\"Retrieves the results of a batch from its output file, parsing the JSONL content\n into a list of dictionaries.\n\n Args:\n batch: the batch to retrieve the results from.\n\n Returns:\n A list of dictionaries containing the results of the batch.\n\n Raises:\n AssertionError: if no output file ID was found in the batch.\n \"\"\"\n import openai\n\n assert batch.output_file_id, \"No output file ID was found in the batch.\"\n\n try:\n file_response = self._client.files.content(batch.output_file_id)\n return [orjson.loads(line) for line in file_response.text.splitlines()]\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while retrieving batch results from file '{batch.output_file_id}': {e}\"\n )\n return []\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jobs","title":"_create_jobs(inputs, **kwargs) ","text":"Creates jobs in the OpenAI Batch API to generate responses for the given inputs. Parameters: Name Type Description Default inputs List[FormattedInput] a list of inputs in chat format to generate responses for. required kwargs Any the keyword arguments to use for the generation. {} Returns: Type Description Tuple[str, ...] A list of job IDs created in the OpenAI Batch API. Source code in src/distilabel/models/llms/openai.py def _create_jobs(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> Tuple[str, ...]:\n \"\"\"Creates jobs in the OpenAI Batch API to generate responses for the given inputs.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n A list of job IDs created in the OpenAI Batch API.\n \"\"\"\n batch_input_files = self._create_batch_files(inputs=inputs, **kwargs)\n jobs = []\n for batch_input_file in batch_input_files:\n if batch := self._create_batch_api_job(batch_input_file):\n jobs.append(batch.id)\n return tuple(jobs)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_batch_api_job","title":"_create_batch_api_job(batch_input_file) ","text":"Creates a job in the OpenAI Batch API to generate responses for the given input file. 
Parameters: Name Type Description Default batch_input_file FileObject the input file to generate responses for. required Returns: Type Description Union[Batch, None] The batch job created in the OpenAI Batch API. Source code in src/distilabel/models/llms/openai.py def _create_batch_api_job(\n self, batch_input_file: \"OpenAIFileObject\"\n) -> Union[\"OpenAIBatch\", None]:\n \"\"\"Creates a job in the OpenAI Batch API to generate responses for the given input\n file.\n\n Args:\n batch_input_file: the input file to generate responses for.\n\n Returns:\n The batch job created in the OpenAI Batch API.\n \"\"\"\n import openai\n\n metadata = {\"description\": \"distilabel\"}\n\n if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME:\n metadata[\"distilabel_pipeline_name\"] = distilabel_pipeline_name\n\n if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID:\n metadata[\"distilabel_pipeline_cache_id\"] = distilabel_pipeline_cache_id\n\n batch = None\n try:\n batch = self._client.batches.create(\n completion_window=\"24h\",\n endpoint=\"/v1/chat/completions\",\n input_file_id=batch_input_file.id,\n metadata=metadata,\n )\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while creating OpenAI Batch API job for file with ID\"\n f\" '{batch_input_file.id}': {e}.\"\n )\n raise e\n return batch\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_batch_files","title":"_create_batch_files(inputs, **kwargs) ","text":"Creates the necessary input files for the batch API to generate responses. The maximum size of each file so the OpenAI Batch API can process it is 100MB, so we need to split the inputs into multiple files if necessary. More information: https://platform.openai.com/docs/api-reference/files/create Parameters: Name Type Description Default inputs List[FormattedInput] a list of inputs in chat format to generate responses for, optionally including structured output. required kwargs Any the keyword arguments to use for the generation. {} Returns: Type Description List[FileObject] The list of file objects created for the OpenAI Batch API. Raises: Type Description OpenAIError if there was an error while creating the batch input file in the OpenAI Batch API. Source code in src/distilabel/models/llms/openai.py def _create_batch_files(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> List[\"OpenAIFileObject\"]:\n \"\"\"Creates the necessary input files for the batch API to generate responses. 
The\n maximum size of each file so the OpenAI Batch API can process it is 100MB, so we\n need to split the inputs into multiple files if necessary.\n\n More information: https://platform.openai.com/docs/api-reference/files/create\n\n Args:\n inputs: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n The list of file objects created for the OpenAI Batch API.\n\n Raises:\n openai.OpenAIError: if there was an error while creating the batch input file\n in the OpenAI Batch API.\n \"\"\"\n import openai\n\n files = []\n for file_no, buffer in enumerate(\n self._create_jsonl_buffers(inputs=inputs, **kwargs)\n ):\n try:\n # TODO: add distilabel pipeline name and id\n batch_input_file = self._client.files.create(\n file=(self._name_for_openai_files(file_no), buffer),\n purpose=\"batch\",\n )\n files.append(batch_input_file)\n except openai.OpenAIError as e:\n self._logger.error( # type: ignore\n f\"Error while creating OpenAI batch input file: {e}\"\n )\n raise e\n return files\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jsonl_buffers","title":"_create_jsonl_buffers(inputs, **kwargs) ","text":"Creates a generator of buffers containing the JSONL formatted inputs to be used by the OpenAI Batch API. The buffers created are of size 100MB or less. Parameters: Name Type Description Default inputs List[FormattedInput] a list of inputs in chat format to generate responses for, optionally including structured output. required kwargs Any the keyword arguments to use for the generation. {} Yields: Type Description BytesIO A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch BytesIO API. Source code in src/distilabel/models/llms/openai.py def _create_jsonl_buffers(\n self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> Generator[io.BytesIO, None, None]:\n \"\"\"Creates a generator of buffers containing the JSONL formatted inputs to be\n used by the OpenAI Batch API. The buffers created are of size 100MB or less.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n kwargs: the keyword arguments to use for the generation.\n\n Yields:\n A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch\n API.\n \"\"\"\n buffer = io.BytesIO()\n buffer_current_size = 0\n for i, input in enumerate(inputs):\n # We create the smallest `custom_id` so we don't increase the size of the file\n # to much, but we can still sort the results with the order of the inputs.\n row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs)\n row_size = len(row)\n if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE:\n buffer.seek(0)\n yield buffer\n buffer = io.BytesIO()\n buffer_current_size = 0\n buffer.write(row)\n buffer_current_size += row_size\n\n if buffer_current_size > 0:\n buffer.seek(0)\n yield buffer\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jsonl_row","title":"_create_jsonl_row(input, custom_id, **kwargs) ","text":"Creates a JSONL formatted row to be used by the OpenAI Batch API. Parameters: Name Type Description Default input FormattedInput a list of inputs in chat format to generate responses for, optionally including structured output. required custom_id str a custom ID to use for the row. required kwargs Any the keyword arguments to use for the generation. 
{} Returns: Type Description bytes A JSONL formatted row to be used by the OpenAI Batch API. Source code in src/distilabel/models/llms/openai.py def _create_jsonl_row(\n self, input: \"FormattedInput\", custom_id: str, **kwargs: Any\n) -> bytes:\n \"\"\"Creates a JSONL formatted row to be used by the OpenAI Batch API.\n\n Args:\n input: a list of inputs in chat format to generate responses for, optionally\n including structured output.\n custom_id: a custom ID to use for the row.\n kwargs: the keyword arguments to use for the generation.\n\n Returns:\n A JSONL formatted row to be used by the OpenAI Batch API.\n \"\"\"\n # TODO: depending on the format of the input, add `response_format` to the kwargs\n row = {\n \"custom_id\": custom_id,\n \"method\": \"POST\",\n \"url\": \"/v1/chat/completions\",\n \"body\": {\"messages\": input, **kwargs},\n }\n json_row = orjson.dumps(row)\n return json_row + b\"\\n\"\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TogetherLLM","title":"TogetherLLM ","text":" Bases: OpenAILLM TogetherLLM LLM implementation running the async API client of OpenAI. Attributes: Name Type Description model the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\". Supported models can be found here. base_url Optional[RuntimeParameter[str]] the base URL to use for the Together API can be set with TOGETHER_BASE_URL . Defaults to None which means that the value set for the environment variable TOGETHER_BASE_URL will be used, or \"https://api.together.xyz/v1\" if not set. api_key Optional[RuntimeParameter[SecretStr]] the API key to authenticate the requests to the Together API. Defaults to None which means that the value set for the environment variable TOGETHER_API_KEY will be used, or None if not set. _api_key_env_var str the name of the environment variable to use for the API key. It is meant to be used internally. Examples: Generate text: from distilabel.models.llms import AnyscaleLLM\n\nllm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Source code in src/distilabel/models/llms/together.py class TogetherLLM(OpenAILLM):\n \"\"\"TogetherLLM LLM implementation running the async API client of OpenAI.\n\n Attributes:\n model: the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\".\n Supported models can be found [here](https://api.together.xyz/models).\n base_url: the base URL to use for the Together API can be set with `TOGETHER_BASE_URL`.\n Defaults to `None` which means that the value set for the environment variable\n `TOGETHER_BASE_URL` will be used, or \"https://api.together.xyz/v1\" if not set.\n api_key: the API key to authenticate the requests to the Together API. Defaults to `None`\n which means that the value set for the environment variable `TOGETHER_API_KEY` will be\n used, or `None` if not set.\n _api_key_env_var: the name of the environment variable to use for the API key. 
It\n is meant to be used internally.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import AnyscaleLLM\n\n llm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\n llm.load()\n\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n \"\"\"\n\n base_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(\n \"TOGETHER_BASE_URL\", \"https://api.together.xyz/v1\"\n ),\n description=\"The base URL to use for the Together API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_TOGETHER_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Together API.\",\n )\n\n _api_key_env_var: str = PrivateAttr(_TOGETHER_API_KEY_ENV_VAR_NAME)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM","title":"VertexAILLM ","text":" Bases: AsyncLLM VertexAI LLM implementation running the async API clients for Gemini. - Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini
To use the VertexAILLM it is necessary to have configured the Google Cloud authentication using one of these methods: - Setting 
GOOGLE_CLOUD_CREDENTIALS environment variable - Using
gcloud auth application-default login command - Using
vertexai.init function from the google-cloud-aiplatform library Attributes: Name Type Description model str the model name to use for the LLM e.g. \"gemini-1.0-pro\". Supported models. _aclient Optional[GenerativeModel] the GenerativeModel to use for the Vertex AI Gemini API. It is meant to be used internally. Set in the load method. Icon :simple-googlecloud: Examples: Generate text: from distilabel.models.llms import VertexAILLM\n\nllm = VertexAILLM(model=\"gemini-1.5-pro\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Source code in src/distilabel/models/llms/vertexai.py class VertexAILLM(AsyncLLM):\n \"\"\"VertexAI LLM implementation running the async API clients for Gemini.\n\n - Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini\n\n To use the `VertexAILLM` is necessary to have configured the Google Cloud authentication\n using one of these methods:\n\n - Setting `GOOGLE_CLOUD_CREDENTIALS` environment variable\n - Using `gcloud auth application-default login` command\n - Using `vertexai.init` function from the `google-cloud-aiplatform` library\n\n Attributes:\n model: the model name to use for the LLM e.g. \"gemini-1.0-pro\". [Supported models](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models).\n _aclient: the `GenerativeModel` to use for the Vertex AI Gemini API. It is meant\n to be used internally. Set in the `load` method.\n\n Icon:\n `:simple-googlecloud:`\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import VertexAILLM\n\n llm = VertexAILLM(model=\"gemini-1.5-pro\")\n\n llm.load()\n\n # Call the model\n output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n \"\"\"\n\n model: str\n\n _num_generations_param_supported = False\n\n _aclient: Optional[\"GenerativeModel\"] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the `GenerativeModel` class which has access to `generate_content_async` to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from vertexai.generative_models import GenerationConfig, GenerativeModel\n\n self._generation_config_class = GenerationConfig\n except ImportError as e:\n raise ImportError(\n \"vertexai is not installed. 
Please install it using\"\n \" `pip install google-cloud-aiplatform`.\"\n ) from e\n\n if _is_gemini_model(self.model):\n self._aclient = GenerativeModel(model_name=self.model)\n else:\n raise NotImplementedError(\n \"`VertexAILLM` is only implemented for `gemini` models that allow for `ChatType` data.\"\n )\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n def _chattype_to_content(self, input: \"StandardInput\") -> List[\"Content\"]:\n \"\"\"Converts a chat type to a list of content items expected by the API.\n\n Args:\n input: the chat type to be converted.\n\n Returns:\n List[str]: a list of content items expected by the API.\n \"\"\"\n from vertexai.generative_models import Content, Part\n\n contents = []\n for message in input:\n if message[\"role\"] not in [\"user\", \"model\"]:\n raise ValueError(\n \"`VertexAILLM only supports the roles 'user' or 'model'.\"\n )\n contents.append(\n Content(\n role=message[\"role\"], parts=[Part.from_text(message[\"content\"])]\n )\n )\n return contents\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: VertexChatType,\n temperature: Optional[float] = None,\n top_p: Optional[float] = None,\n top_k: Optional[int] = None,\n max_output_tokens: Optional[int] = None,\n stop_sequences: Optional[List[str]] = None,\n safety_settings: Optional[Dict[str, Any]] = None,\n tools: Optional[List[Dict[str, Any]]] = None,\n ) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the [VertexAI async client definition](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini).\n\n Args:\n input: a single input in chat format to generate responses for.\n temperature: Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to `None`.\n top_p: If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to `None`.\n top_k: If specified, top-k sampling will be used. Defaults to `None`.\n max_output_tokens: The maximum number of output tokens to generate per message. Defaults to `None`.\n stop_sequences: A list of stop sequences. Defaults to `None`.\n safety_settings: Safety configuration for returned content from the API. Defaults to `None`.\n tools: A potential list of tools that can be used by the API. 
Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from vertexai.generative_models import GenerationConfig\n\n content: \"GenerationResponse\" = await self._aclient.generate_content_async( # type: ignore\n contents=self._chattype_to_content(input),\n generation_config=GenerationConfig(\n candidate_count=1, # only one candidate allowed per call\n temperature=temperature,\n top_k=top_k,\n top_p=top_p,\n max_output_tokens=max_output_tokens,\n stop_sequences=stop_sequences,\n ),\n safety_settings=safety_settings, # type: ignore\n tools=tools, # type: ignore\n stream=False,\n )\n\n text = None\n try:\n text = content.candidates[0].text\n except ValueError:\n self._logger.warning( # type: ignore\n f\"Received no response using VertexAI client (model: '{self.model}').\"\n f\" Finish reason was: '{content.candidates[0].finish_reason}'.\"\n )\n return prepare_output([text], **self._get_llm_statistics(content))\n\n @staticmethod\n def _get_llm_statistics(content: \"GenerationResponse\") -> \"LLMStatistics\":\n return {\n \"input_tokens\": [content.usage_metadata.prompt_token_count],\n \"output_tokens\": [content.usage_metadata.candidates_token_count],\n }\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.load","title":"load() ","text":"Loads the GenerativeModel class which has access to generate_content_async to benefit from async requests. Source code in src/distilabel/models/llms/vertexai.py def load(self) -> None:\n \"\"\"Loads the `GenerativeModel` class which has access to `generate_content_async` to benefit from async requests.\"\"\"\n super().load()\n\n try:\n from vertexai.generative_models import GenerationConfig, GenerativeModel\n\n self._generation_config_class = GenerationConfig\n except ImportError as e:\n raise ImportError(\n \"vertexai is not installed. Please install it using\"\n \" `pip install google-cloud-aiplatform`.\"\n ) from e\n\n if _is_gemini_model(self.model):\n self._aclient = GenerativeModel(model_name=self.model)\n else:\n raise NotImplementedError(\n \"`VertexAILLM` is only implemented for `gemini` models that allow for `ChatType` data.\"\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM._chattype_to_content","title":"_chattype_to_content(input) ","text":"Converts a chat type to a list of content items expected by the API. Parameters: Name Type Description Default input StandardInput the chat type to be converted. required Returns: Type Description List[Content] List[str]: a list of content items expected by the API. 
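Example (an illustrative sketch of the chat format this conversion expects; note that only the \"user\" and \"model\" roles are accepted, so e.g. a \"system\" or \"assistant\" message would raise a ValueError): conversation = [\n    {\"role\": \"user\", \"content\": \"Hi, can you help me with Python?\"},\n    {\"role\": \"model\", \"content\": \"Of course! What do you need help with?\"},\n    {\"role\": \"user\", \"content\": \"How do I reverse a list?\"},\n]\n\noutput = llm.generate(inputs=[conversation])  # `llm` being a loaded `VertexAILLM` as in the example above\n 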
Source code in src/distilabel/models/llms/vertexai.py def _chattype_to_content(self, input: \"StandardInput\") -> List[\"Content\"]:\n \"\"\"Converts a chat type to a list of content items expected by the API.\n\n Args:\n input: the chat type to be converted.\n\n Returns:\n List[str]: a list of content items expected by the API.\n \"\"\"\n from vertexai.generative_models import Content, Part\n\n contents = []\n for message in input:\n if message[\"role\"] not in [\"user\", \"model\"]:\n raise ValueError(\n \"`VertexAILLM only supports the roles 'user' or 'model'.\"\n )\n contents.append(\n Content(\n role=message[\"role\"], parts=[Part.from_text(message[\"content\"])]\n )\n )\n return contents\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.agenerate","title":"agenerate(input, temperature=None, top_p=None, top_k=None, max_output_tokens=None, stop_sequences=None, safety_settings=None, tools=None) async ","text":"Generates num_generations responses for the given input using the VertexAI async client definition. Parameters: Name Type Description Default input VertexChatType a single input in chat format to generate responses for. required temperature Optional[float] Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to None . None top_p Optional[float] If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to None . None top_k Optional[int] If specified, top-k sampling will be used. Defaults to None . None max_output_tokens Optional[int] The maximum number of output tokens to generate per message. Defaults to None . None stop_sequences Optional[List[str]] A list of stop sequences. Defaults to None . None safety_settings Optional[Dict[str, Any]] Safety configuration for returned content from the API. Defaults to None . None tools Optional[List[Dict[str, Any]]] A potential list of tools that can be used by the API. Defaults to None . None Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. Source code in src/distilabel/models/llms/vertexai.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: VertexChatType,\n temperature: Optional[float] = None,\n top_p: Optional[float] = None,\n top_k: Optional[int] = None,\n max_output_tokens: Optional[int] = None,\n stop_sequences: Optional[List[str]] = None,\n safety_settings: Optional[Dict[str, Any]] = None,\n tools: Optional[List[Dict[str, Any]]] = None,\n) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for the given input using the [VertexAI async client definition](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini).\n\n Args:\n input: a single input in chat format to generate responses for.\n temperature: Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to `None`.\n top_p: If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to `None`.\n top_k: If specified, top-k sampling will be used. Defaults to `None`.\n max_output_tokens: The maximum number of output tokens to generate per message. Defaults to `None`.\n stop_sequences: A list of stop sequences. Defaults to `None`.\n safety_settings: Safety configuration for returned content from the API. Defaults to `None`.\n tools: A potential list of tools that can be used by the API. 
Defaults to `None`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from vertexai.generative_models import GenerationConfig\n\n content: \"GenerationResponse\" = await self._aclient.generate_content_async( # type: ignore\n contents=self._chattype_to_content(input),\n generation_config=GenerationConfig(\n candidate_count=1, # only one candidate allowed per call\n temperature=temperature,\n top_k=top_k,\n top_p=top_p,\n max_output_tokens=max_output_tokens,\n stop_sequences=stop_sequences,\n ),\n safety_settings=safety_settings, # type: ignore\n tools=tools, # type: ignore\n stream=False,\n )\n\n text = None\n try:\n text = content.candidates[0].text\n except ValueError:\n self._logger.warning( # type: ignore\n f\"Received no response using VertexAI client (model: '{self.model}').\"\n f\" Finish reason was: '{content.candidates[0].finish_reason}'.\"\n )\n return prepare_output([text], **self._get_llm_statistics(content))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM","title":"ClientvLLM ","text":" Bases: OpenAILLM , MagpieChatTemplateMixin A client for the vLLM server implementing the OpenAI API specification. Attributes: Name Type Description base_url the base URL of the vLLM server. Defaults to \"http://localhost:8000\" . max_retries the maximum number of times to retry the request to the API before failing. Defaults to 6 . timeout the maximum time in seconds to wait for a response from the API. Defaults to 120 . httpx_client_kwargs extra kwargs that will be passed to the httpx.AsyncClient created to comunicate with the vLLM server. Defaults to None . tokenizer Optional[str] the Hugging Face Hub repo id or path of the tokenizer that will be used to apply the chat template and tokenize the inputs before sending it to the server. Defaults to None . tokenizer_revision Optional[str] the revision of the tokenizer to load. Defaults to None . _aclient Optional[str] the httpx.AsyncClient used to comunicate with the vLLM server. Defaults to None . Runtime parameters base_url : the base url of the vLLM server. Defaults to \"http://localhost:8000\" . max_retries : the maximum number of times to retry the request to the API before failing. Defaults to 6 . timeout : the maximum time in seconds to wait for a response from the API. Defaults to 120 . httpx_client_kwargs : extra kwargs that will be passed to the httpx.AsyncClient created to comunicate with the vLLM server. Defaults to None . Examples: Generate text: from distilabel.models.llms import ClientvLLM\n\nllm = ClientvLLM(\n base_url=\"http://localhost:8000/v1\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n)\n\nllm.load()\n\nresults = llm.generate_outputs(\n inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n temperature=0.7,\n top_p=1.0,\n max_new_tokens=256,\n)\n# [\n# [\n# \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n# \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n# \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. 
What's on your mind?\"\n# ]\n# ]\n Source code in src/distilabel/models/llms/vllm.py class ClientvLLM(OpenAILLM, MagpieChatTemplateMixin):\n \"\"\"A client for the `vLLM` server implementing the OpenAI API specification.\n\n Attributes:\n base_url: the base URL of the `vLLM` server. Defaults to `\"http://localhost:8000\"`.\n max_retries: the maximum number of times to retry the request to the API before\n failing. Defaults to `6`.\n timeout: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n httpx_client_kwargs: extra kwargs that will be passed to the `httpx.AsyncClient`\n created to comunicate with the `vLLM` server. Defaults to `None`.\n tokenizer: the Hugging Face Hub repo id or path of the tokenizer that will be used\n to apply the chat template and tokenize the inputs before sending it to the\n server. Defaults to `None`.\n tokenizer_revision: the revision of the tokenizer to load. Defaults to `None`.\n _aclient: the `httpx.AsyncClient` used to comunicate with the `vLLM` server. Defaults\n to `None`.\n\n Runtime parameters:\n - `base_url`: the base url of the `vLLM` server. Defaults to `\"http://localhost:8000\"`.\n - `max_retries`: the maximum number of times to retry the request to the API before\n failing. Defaults to `6`.\n - `timeout`: the maximum time in seconds to wait for a response from the API. Defaults\n to `120`.\n - `httpx_client_kwargs`: extra kwargs that will be passed to the `httpx.AsyncClient`\n created to comunicate with the `vLLM` server. Defaults to `None`.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import ClientvLLM\n\n llm = ClientvLLM(\n base_url=\"http://localhost:8000/v1\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n )\n\n llm.load()\n\n results = llm.generate_outputs(\n inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n temperature=0.7,\n top_p=1.0,\n max_new_tokens=256,\n )\n # [\n # [\n # \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n # \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n # \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. What's on your mind?\"\n # ]\n # ]\n ```\n \"\"\"\n\n model: str = \"\" # Default value so it's not needed to `ClientvLLM(model=\"...\")`\n tokenizer: Optional[str] = None\n tokenizer_revision: Optional[str] = None\n\n # We need the sync client to get the list of models\n _client: \"OpenAI\" = PrivateAttr(None)\n _tokenizer: \"PreTrainedTokenizer\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Creates an `httpx.AsyncClient` to connect to the vLLM server and a tokenizer\n optionally.\"\"\"\n\n self.api_key = SecretStr(\"EMPTY\")\n\n # We need to first create the sync client to get the model name that will be used\n # in the `super().load()` when creating the logger.\n try:\n from openai import OpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. 
Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n self._client = OpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(), # type: ignore\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n super().load()\n\n try:\n from transformers import AutoTokenizer\n except ImportError as ie:\n raise ImportError(\n \"To use `ClientvLLM` you need to install `transformers`.\"\n \"Please install it using `pip install transformers`.\"\n ) from ie\n\n self._tokenizer = AutoTokenizer.from_pretrained(\n self.tokenizer, revision=self.tokenizer_revision\n )\n\n @cached_property\n def model_name(self) -> str: # type: ignore\n \"\"\"Returns the name of the model served with vLLM server.\"\"\"\n models = self._client.models.list()\n return models.data[0].id\n\n def _prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n prompt: str = (\n self._tokenizer.apply_chat_template( # type: ignore\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True, # type: ignore\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n\n @validate_call\n async def agenerate( # type: ignore\n self,\n input: FormattedInput,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n frequency_penalty: float = 0.0,\n logit_bias: Optional[Dict[str, int]] = None,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n ) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for each input.\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n logit_bias: modify the likelihood of specified tokens appearing in the completion.\n Defaults to ``\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: nucleus sampling. The value refers to the top-p tokens that should be\n considered for sampling. Defaults to `1.0`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n\n completion = await self._aclient.completions.create(\n model=self.model_name,\n prompt=self._prepare_input(input), # type: ignore\n n=num_generations,\n max_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n logit_bias=logit_bias,\n presence_penalty=presence_penalty,\n temperature=temperature,\n top_p=top_p,\n )\n\n generations = []\n for choice in completion.choices:\n text = choice.text\n if text == \"\":\n self._logger.warning( # type: ignore\n f\"Received no response from vLLM server (model: '{self.model_name}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(text)\n\n return prepare_output(generations, **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.model_name","title":"model_name: str cached property ","text":"Returns the name of the model served with vLLM server. 
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.load","title":"load() ","text":"Creates an httpx.AsyncClient to connect to the vLLM server and a tokenizer optionally. Source code in src/distilabel/models/llms/vllm.py def load(self) -> None:\n \"\"\"Creates an `httpx.AsyncClient` to connect to the vLLM server and a tokenizer\n optionally.\"\"\"\n\n self.api_key = SecretStr(\"EMPTY\")\n\n # We need to first create the sync client to get the model name that will be used\n # in the `super().load()` when creating the logger.\n try:\n from openai import OpenAI\n except ImportError as ie:\n raise ImportError(\n \"OpenAI Python client is not installed. Please install it using\"\n \" `pip install openai`.\"\n ) from ie\n\n self._client = OpenAI(\n base_url=self.base_url,\n api_key=self.api_key.get_secret_value(), # type: ignore\n max_retries=self.max_retries, # type: ignore\n timeout=self.timeout,\n )\n\n super().load()\n\n try:\n from transformers import AutoTokenizer\n except ImportError as ie:\n raise ImportError(\n \"To use `ClientvLLM` you need to install `transformers`.\"\n \"Please install it using `pip install transformers`.\"\n ) from ie\n\n self._tokenizer = AutoTokenizer.from_pretrained(\n self.tokenizer, revision=self.tokenizer_revision\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM._prepare_input","title":"_prepare_input(input) ","text":"Prepares the input (applying the chat template and tokenization) for the provided input. Parameters: Name Type Description Default input StandardInput the input list containing chat items. required Returns: Type Description str The prompt to send to the LLM. Source code in src/distilabel/models/llms/vllm.py def _prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n prompt: str = (\n self._tokenizer.apply_chat_template( # type: ignore\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True, # type: ignore\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.agenerate","title":"agenerate(input, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, logit_bias=None, presence_penalty=0.0, temperature=1.0, top_p=1.0) async ","text":"Generates num_generations responses for each input. Parameters: Name Type Description Default input FormattedInput a single input in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 frequency_penalty float the repetition penalty to use for the generation. Defaults to 0.0 . 0.0 logit_bias Optional[Dict[str, int]] modify the likelihood of specified tokens appearing in the completion. Defaults to `` None presence_penalty float the presence penalty to use for the generation. Defaults to 0.0 . 0.0 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float nucleus sampling. The value refers to the top-p tokens that should be considered for sampling. Defaults to 1.0 . 1.0 Returns: Type Description GenerateOutput A list of lists of strings containing the generated responses for each input. 
Source code in src/distilabel/models/llms/vllm.py @validate_call\nasync def agenerate( # type: ignore\n self,\n input: FormattedInput,\n num_generations: int = 1,\n max_new_tokens: int = 128,\n frequency_penalty: float = 0.0,\n logit_bias: Optional[Dict[str, int]] = None,\n presence_penalty: float = 0.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n) -> GenerateOutput:\n \"\"\"Generates `num_generations` responses for each input.\n\n Args:\n input: a single input in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n logit_bias: modify the likelihood of specified tokens appearing in the completion.\n Defaults to ``\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: nucleus sampling. The value refers to the top-p tokens that should be\n considered for sampling. Defaults to `1.0`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n\n completion = await self._aclient.completions.create(\n model=self.model_name,\n prompt=self._prepare_input(input), # type: ignore\n n=num_generations,\n max_tokens=max_new_tokens,\n frequency_penalty=frequency_penalty,\n logit_bias=logit_bias,\n presence_penalty=presence_penalty,\n temperature=temperature,\n top_p=top_p,\n )\n\n generations = []\n for choice in completion.choices:\n text = choice.text\n if text == \"\":\n self._logger.warning( # type: ignore\n f\"Received no response from vLLM server (model: '{self.model_name}').\"\n f\" Finish reason was: {choice.finish_reason}\"\n )\n generations.append(text)\n\n return prepare_output(generations, **self._get_llm_statistics(completion))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM","title":"vLLM ","text":" Bases: LLM , MagpieChatTemplateMixin , CudaDevicePlacementMixin vLLM library LLM implementation. Attributes: Name Type Description model str the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. dtype str the data type to use for the model. Defaults to auto . trust_remote_code bool whether to trust the remote code when loading the model. Defaults to False . quantization Optional[str] the quantization mode to use for the model. Defaults to None . revision Optional[str] the revision of the model to load. Defaults to None . tokenizer Optional[str] the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer files. If not provided, the tokenizer will be loaded from the model directory. Defaults to None . tokenizer_mode Literal['auto', 'slow'] the mode to use for the tokenizer. Defaults to auto . tokenizer_revision Optional[str] the revision of the tokenizer to load. Defaults to None . skip_tokenizer_init bool whether to skip the initialization of the tokenizer. Defaults to False . chat_template Optional[str] a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None . 
structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]] a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. seed int the seed to use for the random number generator. Defaults to 0 . extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]] additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {} . _model LLM the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. _tokenizer PreTrainedTokenizer the tokenizer instance used to format the prompt before passing it to the LLM . This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. use_magpie_template PreTrainedTokenizer a flag used to enable/disable applying the Magpie pre-query template. Defaults to False . magpie_pre_query_template PreTrainedTokenizer the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . References - https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
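Engine-level options that are not exposed as first-class attributes can be forwarded through `extra_kwargs`, which is unpacked into the `vllm.LLM` constructor. A minimal sketch; `tensor_parallel_size`, `gpu_memory_utilization` and `max_model_len` are standard `vllm` engine arguments, but check your installed `vllm` version for the exact names:

```python
from distilabel.models.llms import vLLM

llm = vLLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder model id
    extra_kwargs={
        "tensor_parallel_size": 2,       # shard the model across two GPUs
        "gpu_memory_utilization": 0.90,  # fraction of GPU memory the engine may reserve
        "max_model_len": 4096,           # cap the context length to save KV-cache memory
    },
)

llm.load()
```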
Runtime parameters extra_kwargs : additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Examples: Generate text: from distilabel.models.llms import vLLM\n\n# You can pass a custom chat_template to the model\nllm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n Generate structured data: from pathlib import Path\nfrom distilabel.models.llms import vLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\"\n structured_output={\"format\": \"json\", \"schema\": Character},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n Source code in src/distilabel/models/llms/vllm.py class vLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin):\n \"\"\"`vLLM` library LLM implementation.\n\n Attributes:\n model: the model Hugging Face Hub repo id or a path to a directory containing the\n model weights and configuration files.\n dtype: the data type to use for the model. Defaults to `auto`.\n trust_remote_code: whether to trust the remote code when loading the model. Defaults\n to `False`.\n quantization: the quantization mode to use for the model. Defaults to `None`.\n revision: the revision of the model to load. Defaults to `None`.\n tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing\n the tokenizer files. If not provided, the tokenizer will be loaded from the\n model directory. Defaults to `None`.\n tokenizer_mode: the mode to use for the tokenizer. Defaults to `auto`.\n tokenizer_revision: the revision of the tokenizer to load. Defaults to `None`.\n skip_tokenizer_init: whether to skip the initialization of the tokenizer. Defaults\n to `False`.\n chat_template: a chat template that will be used to build the prompts before\n sending them to the model. If not provided, the chat template defined in the\n tokenizer config will be used. If not provided and the tokenizer doesn't have\n a chat template, then ChatML template will be used. Defaults to `None`.\n structured_output: a dictionary containing the structured output configuration or if more\n fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n seed: the seed to use for the random number generator. Defaults to `0`.\n extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n `LLM` class of `vllm` library. Defaults to `{}`.\n _model: the `vLLM` model instance. This attribute is meant to be used internally\n and should not be accessed directly. It will be set in the `load` method.\n _tokenizer: the tokenizer instance used to format the prompt before passing it to\n the `LLM`. This attribute is meant to be used internally and should not be\n accessed directly. It will be set in the `load` method.\n use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n template. Defaults to `False`.\n magpie_pre_query_template: the pre-query template to be applied to the prompt or\n sent to the LLM to generate an instruction or a follow up user message. Valid\n values are \"llama3\", \"qwen2\" or another pre-query template provided. 
Defaults\n to `None`.\n\n References:\n - https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\n\n Runtime parameters:\n - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to\n the `LLM` class of `vllm` library.\n\n Examples:\n Generate text:\n\n ```python\n from distilabel.models.llms import vLLM\n\n # You can pass a custom chat_template to the model\n llm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n )\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n ```\n\n Generate structured data:\n\n ```python\n from pathlib import Path\n from distilabel.models.llms import vLLM\n\n class User(BaseModel):\n name: str\n last_name: str\n id: int\n\n llm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\"\n structured_output={\"format\": \"json\", \"schema\": Character},\n )\n\n llm.load()\n\n # Call the model\n output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n ```\n \"\"\"\n\n model: str\n dtype: str = \"auto\"\n trust_remote_code: bool = False\n quantization: Optional[str] = None\n revision: Optional[str] = None\n\n tokenizer: Optional[str] = None\n tokenizer_mode: Literal[\"auto\", \"slow\"] = \"auto\"\n tokenizer_revision: Optional[str] = None\n skip_tokenizer_init: bool = False\n chat_template: Optional[str] = None\n\n seed: int = 0\n\n extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n default_factory=dict,\n description=\"Additional dictionary of keyword arguments that will be passed to the\"\n \" `vLLM` class of `vllm` library. See all the supported arguments at: \"\n \"https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\",\n )\n structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n default=None,\n description=\"The structured output format to use across all the generations.\",\n )\n\n _model: \"_vLLM\" = PrivateAttr(None)\n _tokenizer: \"PreTrainedTokenizer\" = PrivateAttr(None)\n _structured_output_logits_processor: Optional[Callable] = PrivateAttr(default=None)\n\n def load(self) -> None:\n \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\n Additionally, this method also sets the `chat_template` for the tokenizer, so as to properly\n parse the list of OpenAI formatted inputs using the expected format by the model, otherwise, the\n default value is ChatML format, unless explicitly provided.\n \"\"\"\n super().load()\n\n CudaDevicePlacementMixin.load(self)\n\n try:\n from vllm import LLM as _vLLM\n except ImportError as ie:\n raise ImportError(\n \"vLLM is not installed. 
Please install it using `pip install vllm`.\"\n ) from ie\n\n self._model = _vLLM(\n self.model,\n dtype=self.dtype,\n trust_remote_code=self.trust_remote_code,\n quantization=self.quantization,\n revision=self.revision,\n tokenizer=self.tokenizer,\n tokenizer_mode=self.tokenizer_mode,\n tokenizer_revision=self.tokenizer_revision,\n skip_tokenizer_init=self.skip_tokenizer_init,\n seed=self.seed,\n **self.extra_kwargs, # type: ignore\n )\n\n self._tokenizer = self._model.get_tokenizer() # type: ignore\n if self.chat_template is not None:\n self._tokenizer.chat_template = self.chat_template # type: ignore\n\n if self.structured_output:\n self._structured_output_logits_processor = self._prepare_structured_output(\n self.structured_output\n )\n\n def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n self._cleanup_vllm_model()\n self._model = None # type: ignore\n self._tokenizer = None # type: ignore\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n\n def _cleanup_vllm_model(self) -> None:\n if self._model is None:\n return\n\n import torch # noqa\n from vllm.distributed.parallel_state import (\n destroy_distributed_environment,\n destroy_model_parallel,\n )\n\n destroy_model_parallel()\n destroy_distributed_environment()\n del self._model.llm_engine.model_executor\n del self._model\n with contextlib.suppress(AssertionError):\n torch.distributed.destroy_process_group()\n gc.collect()\n if torch.cuda.is_available():\n torch.cuda.empty_cache()\n torch.cuda.synchronize()\n\n @property\n def model_name(self) -> str:\n \"\"\"Returns the model name used for the LLM.\"\"\"\n return self.model\n\n def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n if self._tokenizer.chat_template is None:\n return [item[\"content\"] for item in input if item[\"role\"] == \"user\"][0]\n\n prompt: str = (\n self._tokenizer.apply_chat_template(\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True, # type: ignore\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n\n def _prepare_batches(\n self, inputs: List[\"StructuredInput\"]\n ) -> Tuple[List[Tuple[List[str], \"OutlinesStructuredOutputType\"]], List[int]]:\n \"\"\"Prepares the inputs by grouping them by the structured output.\n\n When we generate structured outputs with schemas obtained from a dataset, we need to\n prepare the data to try to send batches of inputs instead of single inputs to the model\n to take advante of the engine. So we group the inputs by the structured output to be\n passed in the `generate` method.\n\n Args:\n inputs: The batch of inputs passed to the generate method. 
As we expect to be generating\n structured outputs, each element will be a tuple containing the instruction and the\n structured output.\n\n Returns:\n The prepared batches (sub-batches let's say) to be passed to the `generate` method.\n Each new tuple will contain instead of the single instruction, a list of instructions\n \"\"\"\n instruction_order = {}\n batches: Dict[str, List[str]] = {}\n for i, (instruction, structured_output) in enumerate(inputs):\n instruction = self.prepare_input(instruction)\n instruction_order[instruction] = i\n\n structured_output = json.dumps(structured_output)\n if structured_output not in batches:\n batches[structured_output] = [instruction]\n else:\n batches[structured_output].append(instruction)\n\n # Built a list with instructions sorted by structured output\n flat_instructions = [\n instruction for _, group in batches.items() for instruction in group\n ]\n\n # Generate the list of indices based on the original order\n sorted_indices = [\n instruction_order[instruction] for instruction in flat_instructions\n ]\n\n return [\n (batch, json.loads(schema)) for schema, batch in batches.items()\n ], sorted_indices\n\n @validate_call\n def generate( # noqa: C901 # type: ignore\n self,\n inputs: List[FormattedInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n presence_penalty: float = 0.0,\n frequency_penalty: float = 0.0,\n repetition_penalty: float = 1.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n top_k: int = -1,\n min_p: float = 0.0,\n logprobs: Optional[PositiveInt] = None,\n stop: Optional[List[str]] = None,\n stop_token_ids: Optional[List[int]] = None,\n include_stop_str_in_output: bool = False,\n logits_processors: Optional[LogitsProcessors] = None,\n extra_sampling_params: Optional[Dict[str, Any]] = None,\n ) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for each input.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n repetition_penalty: the repetition penalty to use for the generation Defaults to\n `1.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n top_k: the top-k value to use for the generation. Defaults to `0`.\n min_p: the minimum probability to use for the generation. Defaults to `0.0`.\n logprobs: number of log probabilities to return per output token. If `None`,\n then no log probability won't be returned. Defaults to `None`.\n stop: a list of strings that will be used to stop the generation when found.\n Defaults to `None`.\n stop_token_ids: a list of token ids that will be used to stop the generation\n when found. 
Defaults to `None`.\n include_stop_str_in_output: whether to include the stop string in the output.\n Defaults to `False`.\n logits_processors: a list of functions to process the logits before sampling.\n Defaults to `None`.\n extra_sampling_params: dictionary with additional arguments to be passed to\n the `SamplingParams` class from `vllm`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from vllm import SamplingParams\n\n if not logits_processors:\n logits_processors = []\n\n if extra_sampling_params is None:\n extra_sampling_params = {}\n\n structured_output = None\n\n if isinstance(inputs[0], tuple):\n # Prepare the batches for structured generation\n prepared_batches, sorted_indices = self._prepare_batches(inputs) # type: ignore\n else:\n # Simulate a batch without the structured output content\n prepared_batches = [([self.prepare_input(input) for input in inputs], None)] # type: ignore\n sorted_indices = None\n\n # Case in which we have a single structured output for the dataset\n if self._structured_output_logits_processor:\n logits_processors.append(self._structured_output_logits_processor)\n\n batched_outputs: List[\"LLMOutput\"] = []\n generations = []\n\n for prepared_inputs, structured_output in prepared_batches:\n if self.structured_output is not None and structured_output is not None:\n # TODO: warning\n pass\n\n if structured_output is not None:\n logits_processors.append(\n self._prepare_structured_output(structured_output) # type: ignore\n )\n\n sampling_params = SamplingParams( # type: ignore\n n=num_generations,\n presence_penalty=presence_penalty,\n frequency_penalty=frequency_penalty,\n repetition_penalty=repetition_penalty,\n temperature=temperature,\n top_p=top_p,\n top_k=top_k,\n min_p=min_p,\n max_tokens=max_new_tokens,\n logprobs=logprobs,\n stop=stop,\n stop_token_ids=stop_token_ids,\n include_stop_str_in_output=include_stop_str_in_output,\n logits_processors=logits_processors,\n **extra_sampling_params,\n )\n\n batch_outputs: List[\"RequestOutput\"] = self._model.generate(\n prompts=prepared_inputs,\n sampling_params=sampling_params,\n use_tqdm=False,\n )\n\n # Remove structured output logit processor to avoid stacking structured output\n # logits processors that leads to non-sense generations\n if structured_output is not None:\n logits_processors.pop(-1)\n\n for input, outputs in zip(prepared_inputs, batch_outputs):\n texts, statistics, outputs_logprobs = self._process_outputs(\n input, outputs\n )\n batched_outputs.append(texts)\n generations.append(\n prepare_output(\n generations=texts,\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=outputs_logprobs,\n )\n )\n\n if sorted_indices is not None:\n pairs = list(enumerate(sorted_indices))\n pairs.sort(key=lambda x: x[1])\n generations = [generations[original_idx] for original_idx, _ in pairs]\n\n return generations\n\n def _process_outputs(\n self, input: str, outputs: \"RequestOutput\"\n ) -> Tuple[\"LLMOutput\", \"LLMStatistics\", \"LLMLogprobs\"]:\n texts = []\n outputs_logprobs = []\n statistics = {\n \"input_tokens\": [compute_tokens(input, self._tokenizer.encode)]\n * len(outputs.outputs),\n \"output_tokens\": [],\n }\n for output in outputs.outputs:\n texts.append(output.text)\n statistics[\"output_tokens\"].append(len(output.token_ids))\n if output.logprobs is not None:\n outputs_logprobs.append(self._get_llm_logprobs(output))\n return texts, statistics, outputs_logprobs\n\n def 
_prepare_structured_output(\n self, structured_output: \"OutlinesStructuredOutputType\"\n ) -> Union[Callable, None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n assert structured_output is not None, \"`structured_output` cannot be `None`\"\n\n result = prepare_guided_output(structured_output, \"vllm\", self._model)\n if (schema := result.get(\"schema\")) and self.structured_output:\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n\n def _get_llm_logprobs(self, output: \"CompletionOutput\") -> List[List[\"Logprob\"]]:\n logprobs = []\n for token_logprob in output.logprobs: # type: ignore\n token_logprobs = []\n for logprob in token_logprob.values():\n token_logprobs.append(\n {\"token\": logprob.decoded_token, \"logprob\": logprob.logprob}\n )\n logprobs.append(token_logprobs)\n return logprobs\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.model_name","title":"model_name: str property ","text":"Returns the model name used for the LLM. "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.load","title":"load() ","text":"Loads the vLLM model using either the path or the Hugging Face Hub repository id. Additionally, this method also sets the chat_template for the tokenizer, so as to properly parse the list of OpenAI formatted inputs using the expected format by the model, otherwise, the default value is ChatML format, unless explicitly provided. Source code in src/distilabel/models/llms/vllm.py def load(self) -> None:\n \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\n Additionally, this method also sets the `chat_template` for the tokenizer, so as to properly\n parse the list of OpenAI formatted inputs using the expected format by the model, otherwise, the\n default value is ChatML format, unless explicitly provided.\n \"\"\"\n super().load()\n\n CudaDevicePlacementMixin.load(self)\n\n try:\n from vllm import LLM as _vLLM\n except ImportError as ie:\n raise ImportError(\n \"vLLM is not installed. Please install it using `pip install vllm`.\"\n ) from ie\n\n self._model = _vLLM(\n self.model,\n dtype=self.dtype,\n trust_remote_code=self.trust_remote_code,\n quantization=self.quantization,\n revision=self.revision,\n tokenizer=self.tokenizer,\n tokenizer_mode=self.tokenizer_mode,\n tokenizer_revision=self.tokenizer_revision,\n skip_tokenizer_init=self.skip_tokenizer_init,\n seed=self.seed,\n **self.extra_kwargs, # type: ignore\n )\n\n self._tokenizer = self._model.get_tokenizer() # type: ignore\n if self.chat_template is not None:\n self._tokenizer.chat_template = self.chat_template # type: ignore\n\n if self.structured_output:\n self._structured_output_logits_processor = self._prepare_structured_output(\n self.structured_output\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.unload","title":"unload() ","text":"Unloads the vLLM model. 
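Because `load()` instantiates a full vLLM engine on the GPU, pairing it with `unload()` matters when several models are loaded sequentially in the same process. A minimal sketch of that lifecycle (the model id is a placeholder):

```python
from distilabel.models.llms import vLLM

llm = vLLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct")
llm.load()  # builds the engine and runs the CUDA device placement logic
try:
    outputs = llm.generate_outputs(
        inputs=[[{"role": "user", "content": "Hello!"}]],
        max_new_tokens=32,
    )
    print(outputs)
finally:
    # Destroys the engine and frees GPU memory so another model can be loaded afterwards.
    llm.unload()
```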
Source code in src/distilabel/models/llms/vllm.py def unload(self) -> None:\n \"\"\"Unloads the `vLLM` model.\"\"\"\n self._cleanup_vllm_model()\n self._model = None # type: ignore\n self._tokenizer = None # type: ignore\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.prepare_input","title":"prepare_input(input) ","text":"Prepares the input (applying the chat template and tokenization) for the provided input. Parameters: Name Type Description Default input StandardInput the input list containing chat items. required Returns: Type Description str The prompt to send to the LLM. Source code in src/distilabel/models/llms/vllm.py def prepare_input(self, input: \"StandardInput\") -> str:\n \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n input.\n\n Args:\n input: the input list containing chat items.\n\n Returns:\n The prompt to send to the LLM.\n \"\"\"\n if self._tokenizer.chat_template is None:\n return [item[\"content\"] for item in input if item[\"role\"] == \"user\"][0]\n\n prompt: str = (\n self._tokenizer.apply_chat_template(\n input, # type: ignore\n tokenize=False,\n add_generation_prompt=True, # type: ignore\n )\n if input\n else \"\"\n )\n return super().apply_magpie_pre_query_template(prompt, input)\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM._prepare_batches","title":"_prepare_batches(inputs) ","text":"Prepares the inputs by grouping them by the structured output. When we generate structured outputs with schemas obtained from a dataset, we need to prepare the data to try to send batches of inputs instead of single inputs to the model to take advante of the engine. So we group the inputs by the structured output to be passed in the generate method. Parameters: Name Type Description Default inputs List[StructuredInput] The batch of inputs passed to the generate method. As we expect to be generating structured outputs, each element will be a tuple containing the instruction and the structured output. required Returns: Type Description List[Tuple[List[str], OutlinesStructuredOutputType]] The prepared batches (sub-batches let's say) to be passed to the generate method. List[int] Each new tuple will contain instead of the single instruction, a list of instructions Source code in src/distilabel/models/llms/vllm.py def _prepare_batches(\n self, inputs: List[\"StructuredInput\"]\n) -> Tuple[List[Tuple[List[str], \"OutlinesStructuredOutputType\"]], List[int]]:\n \"\"\"Prepares the inputs by grouping them by the structured output.\n\n When we generate structured outputs with schemas obtained from a dataset, we need to\n prepare the data to try to send batches of inputs instead of single inputs to the model\n to take advante of the engine. So we group the inputs by the structured output to be\n passed in the `generate` method.\n\n Args:\n inputs: The batch of inputs passed to the generate method. 
As we expect to be generating\n structured outputs, each element will be a tuple containing the instruction and the\n structured output.\n\n Returns:\n The prepared batches (sub-batches let's say) to be passed to the `generate` method.\n Each new tuple will contain instead of the single instruction, a list of instructions\n \"\"\"\n instruction_order = {}\n batches: Dict[str, List[str]] = {}\n for i, (instruction, structured_output) in enumerate(inputs):\n instruction = self.prepare_input(instruction)\n instruction_order[instruction] = i\n\n structured_output = json.dumps(structured_output)\n if structured_output not in batches:\n batches[structured_output] = [instruction]\n else:\n batches[structured_output].append(instruction)\n\n # Built a list with instructions sorted by structured output\n flat_instructions = [\n instruction for _, group in batches.items() for instruction in group\n ]\n\n # Generate the list of indices based on the original order\n sorted_indices = [\n instruction_order[instruction] for instruction in flat_instructions\n ]\n\n return [\n (batch, json.loads(schema)) for schema, batch in batches.items()\n ], sorted_indices\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, logprobs=None, stop=None, stop_token_ids=None, include_stop_str_in_output=False, logits_processors=None, extra_sampling_params=None) ","text":"Generates num_generations responses for each input. Parameters: Name Type Description Default inputs List[FormattedInput] a list of inputs in chat format to generate responses for. required num_generations int the number of generations to create per input. Defaults to 1 . 1 max_new_tokens int the maximum number of new tokens that the model will generate. Defaults to 128 . 128 presence_penalty float the presence penalty to use for the generation. Defaults to 0.0 . 0.0 frequency_penalty float the repetition penalty to use for the generation. Defaults to 0.0 . 0.0 repetition_penalty float the repetition penalty to use for the generation Defaults to 1.0 . 1.0 temperature float the temperature to use for the generation. Defaults to 0.1 . 1.0 top_p float the top-p value to use for the generation. Defaults to 1.0 . 1.0 top_k int the top-k value to use for the generation. Defaults to 0 . -1 min_p float the minimum probability to use for the generation. Defaults to 0.0 . 0.0 logprobs Optional[PositiveInt] number of log probabilities to return per output token. If None , then no log probability won't be returned. Defaults to None . None stop Optional[List[str]] a list of strings that will be used to stop the generation when found. Defaults to None . None stop_token_ids Optional[List[int]] a list of token ids that will be used to stop the generation when found. Defaults to None . None include_stop_str_in_output bool whether to include the stop string in the output. Defaults to False . False logits_processors Optional[LogitsProcessors] a list of functions to process the logits before sampling. Defaults to None . None extra_sampling_params Optional[Dict[str, Any]] dictionary with additional arguments to be passed to the SamplingParams class from vllm . None Returns: Type Description List[GenerateOutput] A list of lists of strings containing the generated responses for each input. 
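When the dataset provides a different schema per row, `generate` accepts each input as a `(conversation, structured_output)` tuple and `_prepare_batches` groups rows that share a schema into a single engine call. A sketch of that input shape, assuming a loaded `vLLM` instance named `llm` and an illustrative JSON schema:

```python
# Illustrative JSON schema; any outlines-compatible schema dict should work here.
profile_schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
    "required": ["name", "age"],
}

inputs = [
    (
        [{"role": "user", "content": "Create a short user profile."}],
        {"format": "json", "schema": profile_schema},
    ),
    (
        [{"role": "user", "content": "Create another user profile."}],
        {"format": "json", "schema": profile_schema},
    ),
]

# Both rows share a schema, so they are batched together before being sent to the engine.
generations = llm.generate(inputs=inputs, max_new_tokens=128)
```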
Source code in src/distilabel/models/llms/vllm.py @validate_call\ndef generate( # noqa: C901 # type: ignore\n self,\n inputs: List[FormattedInput],\n num_generations: int = 1,\n max_new_tokens: int = 128,\n presence_penalty: float = 0.0,\n frequency_penalty: float = 0.0,\n repetition_penalty: float = 1.0,\n temperature: float = 1.0,\n top_p: float = 1.0,\n top_k: int = -1,\n min_p: float = 0.0,\n logprobs: Optional[PositiveInt] = None,\n stop: Optional[List[str]] = None,\n stop_token_ids: Optional[List[int]] = None,\n include_stop_str_in_output: bool = False,\n logits_processors: Optional[LogitsProcessors] = None,\n extra_sampling_params: Optional[Dict[str, Any]] = None,\n) -> List[GenerateOutput]:\n \"\"\"Generates `num_generations` responses for each input.\n\n Args:\n inputs: a list of inputs in chat format to generate responses for.\n num_generations: the number of generations to create per input. Defaults to\n `1`.\n max_new_tokens: the maximum number of new tokens that the model will generate.\n Defaults to `128`.\n presence_penalty: the presence penalty to use for the generation. Defaults to\n `0.0`.\n frequency_penalty: the repetition penalty to use for the generation. Defaults\n to `0.0`.\n repetition_penalty: the repetition penalty to use for the generation Defaults to\n `1.0`.\n temperature: the temperature to use for the generation. Defaults to `0.1`.\n top_p: the top-p value to use for the generation. Defaults to `1.0`.\n top_k: the top-k value to use for the generation. Defaults to `0`.\n min_p: the minimum probability to use for the generation. Defaults to `0.0`.\n logprobs: number of log probabilities to return per output token. If `None`,\n then no log probability won't be returned. Defaults to `None`.\n stop: a list of strings that will be used to stop the generation when found.\n Defaults to `None`.\n stop_token_ids: a list of token ids that will be used to stop the generation\n when found. 
Defaults to `None`.\n include_stop_str_in_output: whether to include the stop string in the output.\n Defaults to `False`.\n logits_processors: a list of functions to process the logits before sampling.\n Defaults to `None`.\n extra_sampling_params: dictionary with additional arguments to be passed to\n the `SamplingParams` class from `vllm`.\n\n Returns:\n A list of lists of strings containing the generated responses for each input.\n \"\"\"\n from vllm import SamplingParams\n\n if not logits_processors:\n logits_processors = []\n\n if extra_sampling_params is None:\n extra_sampling_params = {}\n\n structured_output = None\n\n if isinstance(inputs[0], tuple):\n # Prepare the batches for structured generation\n prepared_batches, sorted_indices = self._prepare_batches(inputs) # type: ignore\n else:\n # Simulate a batch without the structured output content\n prepared_batches = [([self.prepare_input(input) for input in inputs], None)] # type: ignore\n sorted_indices = None\n\n # Case in which we have a single structured output for the dataset\n if self._structured_output_logits_processor:\n logits_processors.append(self._structured_output_logits_processor)\n\n batched_outputs: List[\"LLMOutput\"] = []\n generations = []\n\n for prepared_inputs, structured_output in prepared_batches:\n if self.structured_output is not None and structured_output is not None:\n # TODO: warning\n pass\n\n if structured_output is not None:\n logits_processors.append(\n self._prepare_structured_output(structured_output) # type: ignore\n )\n\n sampling_params = SamplingParams( # type: ignore\n n=num_generations,\n presence_penalty=presence_penalty,\n frequency_penalty=frequency_penalty,\n repetition_penalty=repetition_penalty,\n temperature=temperature,\n top_p=top_p,\n top_k=top_k,\n min_p=min_p,\n max_tokens=max_new_tokens,\n logprobs=logprobs,\n stop=stop,\n stop_token_ids=stop_token_ids,\n include_stop_str_in_output=include_stop_str_in_output,\n logits_processors=logits_processors,\n **extra_sampling_params,\n )\n\n batch_outputs: List[\"RequestOutput\"] = self._model.generate(\n prompts=prepared_inputs,\n sampling_params=sampling_params,\n use_tqdm=False,\n )\n\n # Remove structured output logit processor to avoid stacking structured output\n # logits processors that leads to non-sense generations\n if structured_output is not None:\n logits_processors.pop(-1)\n\n for input, outputs in zip(prepared_inputs, batch_outputs):\n texts, statistics, outputs_logprobs = self._process_outputs(\n input, outputs\n )\n batched_outputs.append(texts)\n generations.append(\n prepare_output(\n generations=texts,\n input_tokens=statistics[\"input_tokens\"],\n output_tokens=statistics[\"output_tokens\"],\n logprobs=outputs_logprobs,\n )\n )\n\n if sorted_indices is not None:\n pairs = list(enumerate(sorted_indices))\n pairs.sort(key=lambda x: x[1])\n generations = [generations[original_idx] for original_idx, _ in pairs]\n\n return generations\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output) ","text":"Creates the appropriate function to filter tokens to generate structured outputs. Parameters: Name Type Description Default structured_output OutlinesStructuredOutputType the configuration dict to prepare the structured output. required Returns: Type Description Union[Callable, None] The callable that will be used to guide the generation of the model. 
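A self-contained sketch of the class-level `structured_output` configuration that this processor backs, using a Pydantic model as the schema (model id reused from the examples above, prompt text is illustrative):

```python
from pydantic import BaseModel

from distilabel.models.llms import vLLM


class User(BaseModel):
    name: str
    last_name: str
    id: int


llm = vLLM(
    model="prometheus-eval/prometheus-7b-v2.0",
    structured_output={"format": "json", "schema": User},
)
llm.load()

output = llm.generate_outputs(
    inputs=[[{"role": "user", "content": "Create a user profile."}]]
)
```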
Source code in src/distilabel/models/llms/vllm.py def _prepare_structured_output(\n self, structured_output: \"OutlinesStructuredOutputType\"\n) -> Union[Callable, None]:\n \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n Args:\n structured_output: the configuration dict to prepare the structured output.\n\n Returns:\n The callable that will be used to guide the generation of the model.\n \"\"\"\n from distilabel.steps.tasks.structured_outputs.outlines import (\n prepare_guided_output,\n )\n\n assert structured_output is not None, \"`structured_output` cannot be `None`\"\n\n result = prepare_guided_output(structured_output, \"vllm\", self._model)\n if (schema := result.get(\"schema\")) and self.structured_output:\n self.structured_output[\"schema\"] = schema\n return result[\"processor\"]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin","title":"CudaDevicePlacementMixin ","text":" Bases: BaseModel Mixin class to assign CUDA devices to the LLM based on the cuda_devices attribute and the device placement information provided in _device_llm_placement_map . Providing the device placement information is optional, but if it is provided, it will be used to assign CUDA devices to the LLM s, trying to avoid using the same device for different LLM s. Attributes: Name Type Description cuda_devices RuntimeParameter[Union[List[int], Literal['auto']]] a list with the ID of the CUDA devices to be used by the LLM . If set to \"auto\", the devices will be automatically assigned based on the device placement information provided in _device_llm_placement_map . If set to a list of devices, it will be checked if the devices are available to be used by the LLM . If not, a warning will be logged. disable_cuda_device_placement RuntimeParameter[bool] Whether to disable the CUDA device placement logic or not. Defaults to False . _llm_identifier Union[str, None] the identifier of the LLM to be used as key in _device_llm_placement_map . _device_llm_placement_map Generator[Dict[str, List[int]], None, None] a dictionary with the device placement information for each LLM . Source code in src/distilabel/models/mixins/cuda_device_placement.py class CudaDevicePlacementMixin(BaseModel):\n \"\"\"Mixin class to assign CUDA devices to the `LLM` based on the `cuda_devices` attribute\n and the device placement information provided in `_device_llm_placement_map`. Providing\n the device placement information is optional, but if it is provided, it will be used to\n assign CUDA devices to the `LLM`s, trying to avoid using the same device for different\n `LLM`s.\n\n Attributes:\n cuda_devices: a list with the ID of the CUDA devices to be used by the `LLM`. If set\n to \"auto\", the devices will be automatically assigned based on the device\n placement information provided in `_device_llm_placement_map`. If set to a list\n of devices, it will be checked if the devices are available to be used by the\n `LLM`. If not, a warning will be logged.\n disable_cuda_device_placement: Whether to disable the CUDA device placement logic\n or not. 
Defaults to `False`.\n _llm_identifier: the identifier of the `LLM` to be used as key in `_device_llm_placement_map`.\n _device_llm_placement_map: a dictionary with the device placement information for each\n `LLM`.\n \"\"\"\n\n cuda_devices: RuntimeParameter[Union[List[int], Literal[\"auto\"]]] = Field(\n default=\"auto\", description=\"A list with the ID of the CUDA devices to be used.\"\n )\n disable_cuda_device_placement: RuntimeParameter[bool] = Field(\n default=False,\n description=\"Whether to disable the CUDA device placement logic or not.\",\n )\n\n _llm_identifier: Union[str, None] = PrivateAttr(default=None)\n _desired_num_gpus: PositiveInt = PrivateAttr(default=1)\n _available_cuda_devices: List[int] = PrivateAttr(default_factory=list)\n _can_check_cuda_devices: bool = PrivateAttr(default=False)\n\n _logger: \"Logger\" = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Assign CUDA devices to the LLM based on the device placement information provided\n in `_device_llm_placement_map`.\"\"\"\n\n if self.disable_cuda_device_placement:\n return\n\n try:\n import pynvml\n\n pynvml.nvmlInit()\n device_count = pynvml.nvmlDeviceGetCount()\n self._available_cuda_devices = list(range(device_count))\n self._can_check_cuda_devices = True\n except ImportError as ie:\n if self.cuda_devices == \"auto\":\n raise ImportError(\n \"The 'pynvml' library is not installed. It is required to automatically\"\n \" assign CUDA devices to the `LLM`s. Please, install it and try again.\"\n ) from ie\n\n if self.cuda_devices:\n self._logger.warning( # type: ignore\n \"The 'pynvml' library is not installed. It is recommended to install it\"\n \" to check if the CUDA devices assigned to the LLM are available.\"\n )\n\n self._assign_cuda_devices()\n\n def unload(self) -> None:\n \"\"\"Unloads the LLM and removes the CUDA devices assigned to it from the device\n placement information provided in `_device_llm_placement_map`.\"\"\"\n if self.disable_cuda_device_placement:\n return\n\n with self._device_llm_placement_map() as device_map:\n if self._llm_identifier in device_map:\n self._logger.debug( # type: ignore\n f\"Removing '{self._llm_identifier}' from the CUDA device map file\"\n f\" '{_CUDA_DEVICE_PLACEMENT_MIXIN_FILE}'.\"\n )\n del device_map[self._llm_identifier]\n\n @contextmanager\n def _device_llm_placement_map(self) -> Generator[Dict[str, List[int]], None, None]:\n \"\"\"Reads the content of the device placement file of the node with a lock, yields\n the content, and writes the content back to the file after the context manager is\n closed. If the file doesn't exist, an empty dictionary will be yielded.\n\n Yields:\n The content of the device placement file.\n \"\"\"\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.parent.mkdir(parents=True, exist_ok=True)\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.touch()\n with portalocker.Lock(\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE,\n \"r+\",\n flags=portalocker.LockFlags.EXCLUSIVE,\n ) as f:\n try:\n content = json.load(f)\n except json.JSONDecodeError:\n content = {}\n yield content\n f.seek(0)\n f.truncate()\n f.write(json.dumps(content))\n\n def _assign_cuda_devices(self) -> None:\n \"\"\"Assigns CUDA devices to the LLM based on the device placement information provided\n in `_device_llm_placement_map`. If the `cuda_devices` attribute is set to \"auto\", it\n will be set to the first available CUDA device that is not going to be used by any\n other LLM. 
If the `cuda_devices` attribute is set to a list of devices, it will be\n checked if the devices are available to be used by the LLM. If not, a warning will be\n logged.\"\"\"\n\n # Take the lock and read the device placement information for each LLM.\n with self._device_llm_placement_map() as device_map:\n if self.cuda_devices == \"auto\":\n self.cuda_devices = []\n for _ in range(self._desired_num_gpus):\n if (device_id := self._get_cuda_device(device_map)) is not None:\n self.cuda_devices.append(device_id)\n device_map[self._llm_identifier] = self.cuda_devices # type: ignore\n if len(self.cuda_devices) != self._desired_num_gpus:\n self._logger.warning( # type: ignore\n f\"Could not assign the desired number of GPUs {self._desired_num_gpus}\"\n f\" for LLM with identifier '{self._llm_identifier}'.\"\n )\n else:\n self._check_cuda_devices(device_map)\n\n device_map[self._llm_identifier] = self.cuda_devices # type: ignore\n\n # `_device_llm_placement_map` was not provided and user didn't set the `cuda_devices`\n # attribute. In this case, the `cuda_devices` attribute will be set to an empty list.\n if self.cuda_devices == \"auto\":\n self.cuda_devices = []\n\n self._set_cuda_visible_devices()\n\n def _check_cuda_devices(self, device_map: Dict[str, List[int]]) -> None:\n \"\"\"Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs.\n\n Args:\n device_map: a dictionary with the device placement information for each LLM.\n \"\"\"\n for device in self.cuda_devices: # type: ignore\n for llm, devices in device_map.items():\n if device in devices:\n self._logger.warning( # type: ignore\n f\"LLM with identifier '{llm}' is also going to use CUDA device \"\n f\"'{device}'. This may lead to performance issues or running out\"\n \" of memory depending on the device capabilities and the loaded\"\n \" models.\"\n )\n\n def _get_cuda_device(self, device_map: Dict[str, List[int]]) -> Union[int, None]:\n \"\"\"Returns the first available CUDA device to be used by the LLM that is not going\n to be used by any other LLM.\n\n Args:\n device_map: a dictionary with the device placement information for each LLM.\n\n Returns:\n The first available CUDA device to be used by the LLM.\n\n Raises:\n RuntimeError: if there is no available CUDA device to be used by the LLM.\n \"\"\"\n for device in self._available_cuda_devices:\n if all(device not in devices for devices in device_map.values()):\n return device\n\n return None\n\n def _set_cuda_visible_devices(self) -> None:\n \"\"\"Sets the `CUDA_VISIBLE_DEVICES` environment variable to the list of CUDA devices\n to be used by the LLM.\n \"\"\"\n if not self.cuda_devices:\n return\n\n if self._can_check_cuda_devices and not all(\n device in self._available_cuda_devices for device in self.cuda_devices\n ):\n raise RuntimeError(\n f\"Invalid CUDA devices for LLM '{self._llm_identifier}': {self.cuda_devices}.\"\n f\" The available devices are: {self._available_cuda_devices}. 
Please, review\"\n \" the 'cuda_devices' attribute and try again.\"\n )\n\n cuda_devices = \",\".join([str(device) for device in self.cuda_devices])\n self._logger.info( # type: ignore\n f\"\ud83c\udfae LLM '{self._llm_identifier}' is going to use the following CUDA devices:\"\n f\" {self.cuda_devices}.\"\n )\n os.environ[\"CUDA_VISIBLE_DEVICES\"] = cuda_devices\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin.load","title":"load() ","text":"Assign CUDA devices to the LLM based on the device placement information provided in _device_llm_placement_map . Source code in src/distilabel/models/mixins/cuda_device_placement.py def load(self) -> None:\n \"\"\"Assign CUDA devices to the LLM based on the device placement information provided\n in `_device_llm_placement_map`.\"\"\"\n\n if self.disable_cuda_device_placement:\n return\n\n try:\n import pynvml\n\n pynvml.nvmlInit()\n device_count = pynvml.nvmlDeviceGetCount()\n self._available_cuda_devices = list(range(device_count))\n self._can_check_cuda_devices = True\n except ImportError as ie:\n if self.cuda_devices == \"auto\":\n raise ImportError(\n \"The 'pynvml' library is not installed. It is required to automatically\"\n \" assign CUDA devices to the `LLM`s. Please, install it and try again.\"\n ) from ie\n\n if self.cuda_devices:\n self._logger.warning( # type: ignore\n \"The 'pynvml' library is not installed. It is recommended to install it\"\n \" to check if the CUDA devices assigned to the LLM are available.\"\n )\n\n self._assign_cuda_devices()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin.unload","title":"unload() ","text":"Unloads the LLM and removes the CUDA devices assigned to it from the device placement information provided in _device_llm_placement_map . Source code in src/distilabel/models/mixins/cuda_device_placement.py def unload(self) -> None:\n \"\"\"Unloads the LLM and removes the CUDA devices assigned to it from the device\n placement information provided in `_device_llm_placement_map`.\"\"\"\n if self.disable_cuda_device_placement:\n return\n\n with self._device_llm_placement_map() as device_map:\n if self._llm_identifier in device_map:\n self._logger.debug( # type: ignore\n f\"Removing '{self._llm_identifier}' from the CUDA device map file\"\n f\" '{_CUDA_DEVICE_PLACEMENT_MIXIN_FILE}'.\"\n )\n del device_map[self._llm_identifier]\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._device_llm_placement_map","title":"_device_llm_placement_map() ","text":"Reads the content of the device placement file of the node with a lock, yields the content, and writes the content back to the file after the context manager is closed. If the file doesn't exist, an empty dictionary will be yielded. Yields: Type Description Dict[str, List[int]] The content of the device placement file. Source code in src/distilabel/models/mixins/cuda_device_placement.py @contextmanager\ndef _device_llm_placement_map(self) -> Generator[Dict[str, List[int]], None, None]:\n \"\"\"Reads the content of the device placement file of the node with a lock, yields\n the content, and writes the content back to the file after the context manager is\n closed. 
If the file doesn't exist, an empty dictionary will be yielded.\n\n Yields:\n The content of the device placement file.\n \"\"\"\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.parent.mkdir(parents=True, exist_ok=True)\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.touch()\n with portalocker.Lock(\n _CUDA_DEVICE_PLACEMENT_MIXIN_FILE,\n \"r+\",\n flags=portalocker.LockFlags.EXCLUSIVE,\n ) as f:\n try:\n content = json.load(f)\n except json.JSONDecodeError:\n content = {}\n yield content\n f.seek(0)\n f.truncate()\n f.write(json.dumps(content))\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._assign_cuda_devices","title":"_assign_cuda_devices() ","text":"Assigns CUDA devices to the LLM based on the device placement information provided in _device_llm_placement_map . If the cuda_devices attribute is set to \"auto\", it will be set to the first available CUDA device that is not going to be used by any other LLM. If the cuda_devices attribute is set to a list of devices, it will be checked if the devices are available to be used by the LLM. If not, a warning will be logged. Source code in src/distilabel/models/mixins/cuda_device_placement.py def _assign_cuda_devices(self) -> None:\n \"\"\"Assigns CUDA devices to the LLM based on the device placement information provided\n in `_device_llm_placement_map`. If the `cuda_devices` attribute is set to \"auto\", it\n will be set to the first available CUDA device that is not going to be used by any\n other LLM. If the `cuda_devices` attribute is set to a list of devices, it will be\n checked if the devices are available to be used by the LLM. If not, a warning will be\n logged.\"\"\"\n\n # Take the lock and read the device placement information for each LLM.\n with self._device_llm_placement_map() as device_map:\n if self.cuda_devices == \"auto\":\n self.cuda_devices = []\n for _ in range(self._desired_num_gpus):\n if (device_id := self._get_cuda_device(device_map)) is not None:\n self.cuda_devices.append(device_id)\n device_map[self._llm_identifier] = self.cuda_devices # type: ignore\n if len(self.cuda_devices) != self._desired_num_gpus:\n self._logger.warning( # type: ignore\n f\"Could not assign the desired number of GPUs {self._desired_num_gpus}\"\n f\" for LLM with identifier '{self._llm_identifier}'.\"\n )\n else:\n self._check_cuda_devices(device_map)\n\n device_map[self._llm_identifier] = self.cuda_devices # type: ignore\n\n # `_device_llm_placement_map` was not provided and user didn't set the `cuda_devices`\n # attribute. In this case, the `cuda_devices` attribute will be set to an empty list.\n if self.cuda_devices == \"auto\":\n self.cuda_devices = []\n\n self._set_cuda_visible_devices()\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._check_cuda_devices","title":"_check_cuda_devices(device_map) ","text":"Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs. Parameters: Name Type Description Default device_map Dict[str, List[int]] a dictionary with the device placement information for each LLM. 
required Source code in src/distilabel/models/mixins/cuda_device_placement.py def _check_cuda_devices(self, device_map: Dict[str, List[int]]) -> None:\n \"\"\"Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs.\n\n Args:\n device_map: a dictionary with the device placement information for each LLM.\n \"\"\"\n for device in self.cuda_devices: # type: ignore\n for llm, devices in device_map.items():\n if device in devices:\n self._logger.warning( # type: ignore\n f\"LLM with identifier '{llm}' is also going to use CUDA device \"\n f\"'{device}'. This may lead to performance issues or running out\"\n \" of memory depending on the device capabilities and the loaded\"\n \" models.\"\n )\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._get_cuda_device","title":"_get_cuda_device(device_map) ","text":"Returns the first available CUDA device to be used by the LLM that is not going to be used by any other LLM. Parameters: Name Type Description Default device_map Dict[str, List[int]] a dictionary with the device placement information for each LLM. required Returns: Type Description Union[int, None] The first available CUDA device to be used by the LLM. Raises: Type Description RuntimeError if there is no available CUDA device to be used by the LLM. Source code in src/distilabel/models/mixins/cuda_device_placement.py def _get_cuda_device(self, device_map: Dict[str, List[int]]) -> Union[int, None]:\n \"\"\"Returns the first available CUDA device to be used by the LLM that is not going\n to be used by any other LLM.\n\n Args:\n device_map: a dictionary with the device placement information for each LLM.\n\n Returns:\n The first available CUDA device to be used by the LLM.\n\n Raises:\n RuntimeError: if there is no available CUDA device to be used by the LLM.\n \"\"\"\n for device in self._available_cuda_devices:\n if all(device not in devices for devices in device_map.values()):\n return device\n\n return None\n "},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._set_cuda_visible_devices","title":"_set_cuda_visible_devices() ","text":"Sets the CUDA_VISIBLE_DEVICES environment variable to the list of CUDA devices to be used by the LLM. Source code in src/distilabel/models/mixins/cuda_device_placement.py def _set_cuda_visible_devices(self) -> None:\n \"\"\"Sets the `CUDA_VISIBLE_DEVICES` environment variable to the list of CUDA devices\n to be used by the LLM.\n \"\"\"\n if not self.cuda_devices:\n return\n\n if self._can_check_cuda_devices and not all(\n device in self._available_cuda_devices for device in self.cuda_devices\n ):\n raise RuntimeError(\n f\"Invalid CUDA devices for LLM '{self._llm_identifier}': {self.cuda_devices}.\"\n f\" The available devices are: {self._available_cuda_devices}. Please, review\"\n \" the 'cuda_devices' attribute and try again.\"\n )\n\n cuda_devices = \",\".join([str(device) for device in self.cuda_devices])\n self._logger.info( # type: ignore\n f\"\ud83c\udfae LLM '{self._llm_identifier}' is going to use the following CUDA devices:\"\n f\" {self.cuda_devices}.\"\n )\n os.environ[\"CUDA_VISIBLE_DEVICES\"] = cuda_devices\n "},{"location":"api/pipeline/","title":"Pipeline","text":"This section contains the API reference for the distilabel pipelines. For an example on how to use the pipelines, see the Tutorial - Pipeline. 
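Before moving on to pipelines, a brief sketch of the placement options exposed by `CudaDevicePlacementMixin` above: either pin an `LLM` to explicit device ids or disable the logic entirely (device and model ids are placeholders):

```python
from distilabel.models.llms import vLLM

# Pin this engine to GPUs 0 and 1 instead of the default "auto" assignment.
llm = vLLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    cuda_devices=[0, 1],
    extra_kwargs={"tensor_parallel_size": 2},
)

# Or opt out of the placement logic and manage CUDA_VISIBLE_DEVICES yourself.
llm_manual = vLLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    disable_cuda_device_placement=True,
)
```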
"},{"location":"api/pipeline/#distilabel.pipeline.base","title":"base ","text":""},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline","title":"BasePipeline ","text":" Bases: ABC , RequirementsMixin , _Serializable Base class for a distilabel pipeline. Attributes: Name Type Description name The name of the pipeline. description A description of the pipeline. dag The DAG instance that represents the pipeline. _cache_dir The directory where the pipeline will be cached. _logger The logger instance that will be used by the pipeline. _batch_manager Optional[_BatchManager] The batch manager that will manage the batches received from the steps while running the pipeline. It will be created when the pipeline is run, from scratch or from cache. Defaults to None . _write_buffer Optional[_WriteBuffer] The buffer that will store the data of the leaf steps of the pipeline while running, so the Distiset can be created at the end. It will be created when the pipeline is run. Defaults to None . _fs Optional[AbstractFileSystem] The fsspec filesystem to be used to store the data of the _Batch es passed between the steps. It will be set when the pipeline is run. Defaults to None . _storage_base_path Optional[str] The base path where the data of the _Batch es passed between the steps will be stored. It will be set then the pipeline is run. Defaults to None . _use_fs_to_pass_data bool Whether to use the file system to pass the data of the _Batch es between the steps. Even if this parameter is False , the Batch es received by GlobalStep s will always use the file system to pass the data. Defaults to False . _dry_run A flag to indicate if the pipeline is running in dry run mode. Defaults to False . output_queue A queue to store the output of the steps while running the pipeline. load_queue A queue used by each Step to notify the main process it has finished loading or it the step has been unloaded. Source code in src/distilabel/pipeline/base.py class BasePipeline(ABC, RequirementsMixin, _Serializable):\n \"\"\"Base class for a `distilabel` pipeline.\n\n Attributes:\n name: The name of the pipeline.\n description: A description of the pipeline.\n dag: The `DAG` instance that represents the pipeline.\n _cache_dir: The directory where the pipeline will be cached.\n _logger: The logger instance that will be used by the pipeline.\n _batch_manager: The batch manager that will manage the batches received from the\n steps while running the pipeline. It will be created when the pipeline is run,\n from scratch or from cache. Defaults to `None`.\n _write_buffer: The buffer that will store the data of the leaf steps of the pipeline\n while running, so the `Distiset` can be created at the end. It will be created\n when the pipeline is run. Defaults to `None`.\n _fs: The `fsspec` filesystem to be used to store the data of the `_Batch`es passed\n between the steps. It will be set when the pipeline is run. Defaults to `None`.\n _storage_base_path: The base path where the data of the `_Batch`es passed between\n the steps will be stored. It will be set then the pipeline is run. Defaults\n to `None`.\n _use_fs_to_pass_data: Whether to use the file system to pass the data of the\n `_Batch`es between the steps. Even if this parameter is `False`, the `Batch`es\n received by `GlobalStep`s will always use the file system to pass the data.\n Defaults to `False`.\n _dry_run: A flag to indicate if the pipeline is running in dry run mode. 
Defaults\n to `False`.\n output_queue: A queue to store the output of the steps while running the pipeline.\n load_queue: A queue used by each `Step` to notify the main process it has finished\n loading or it the step has been unloaded.\n \"\"\"\n\n _output_queue: \"Queue[Any]\"\n _load_queue: \"Queue[Union[StepLoadStatus, None]]\"\n\n def __init__(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n cache_dir: Optional[Union[str, \"PathLike\"]] = None,\n enable_metadata: bool = False,\n requirements: Optional[List[str]] = None,\n ) -> None:\n \"\"\"Initialize the `BasePipeline` instance.\n\n Args:\n name: The name of the pipeline. If not generated, a random one will be generated by default.\n description: A description of the pipeline. Defaults to `None`.\n cache_dir: A directory where the pipeline will be cached. Defaults to `None`.\n enable_metadata: Whether to include the distilabel metadata column for the pipeline\n in the final `Distiset`. It contains metadata used by distilabel, for example\n the raw outputs of the `LLM` without processing would be here, inside `raw_output_...`\n field. Defaults to `False`.\n requirements: List of requirements that must be installed to run the pipeline.\n Defaults to `None`, but can be helpful to inform in a pipeline to be shared\n that this requirements must be installed.\n \"\"\"\n self.name = name or _PIPELINE_DEFAULT_NAME\n self.description = description\n self._enable_metadata = enable_metadata\n self.dag = DAG()\n\n if cache_dir:\n self._cache_dir = Path(cache_dir)\n elif env_cache_dir := envs.DISTILABEL_CACHE_DIR:\n self._cache_dir = Path(env_cache_dir)\n else:\n self._cache_dir = constants.PIPELINES_CACHE_DIR\n\n self._logger = logging.getLogger(\"distilabel.pipeline\")\n\n self._batch_manager: Optional[\"_BatchManager\"] = None\n self._write_buffer: Optional[\"_WriteBuffer\"] = None\n self._steps_input_queues: Dict[str, \"Queue\"] = {}\n\n self._steps_load_status: Dict[str, int] = {}\n self._steps_load_status_lock = threading.Lock()\n\n self._stop_called = False\n self._stop_called_lock = threading.Lock()\n self._stop_calls = 0\n\n self._recover_offline_batch_generate_for_step: Union[\n Tuple[str, List[List[Dict[str, Any]]]], None\n ] = None\n\n self._fs: Optional[fsspec.AbstractFileSystem] = None\n self._storage_base_path: Optional[str] = None\n self._use_fs_to_pass_data: bool = False\n self._dry_run = False\n\n self._current_stage = 0\n self._stages_last_batch: List[List[str]] = []\n self._load_groups = []\n\n self.requirements = requirements or []\n\n self._exception: Union[Exception, None] = None\n\n self._log_queue: Union[\"Queue[Any]\", None] = None\n\n def __enter__(self) -> Self:\n \"\"\"Set the global pipeline instance when entering a pipeline context.\"\"\"\n _GlobalPipelineManager.set_pipeline(self)\n return self\n\n def __exit__(self, exc_type, exc_value, traceback) -> None:\n \"\"\"Unset the global pipeline instance when exiting a pipeline context.\"\"\"\n _GlobalPipelineManager.set_pipeline(None)\n self._set_pipeline_name()\n\n def _set_pipeline_name(self) -> None:\n \"\"\"Creates a name for the pipeline if it's the default one (if hasn't been set).\"\"\"\n if self.name == _PIPELINE_DEFAULT_NAME:\n self.name = f\"pipeline_{'_'.join(self.dag)}\"\n\n @property\n def signature(self) -> str:\n \"\"\"Makes a signature (hash) of a pipeline, using the step ids and the adjacency between them.\n\n The main use is to find the pipeline in the cache folder.\n\n Returns:\n Signature of the pipeline.\n \"\"\"\n\n 
pipeline_dump = self.dump()[\"pipeline\"]\n steps_names = list(self.dag)\n connections_info = [\n f\"{c['from']}-{'-'.join(c['to'])}\" for c in pipeline_dump[\"connections\"]\n ]\n\n routing_batch_functions_info = []\n for function in pipeline_dump[\"routing_batch_functions\"]:\n step = function[\"step\"]\n routing_batch_function: \"RoutingBatchFunction\" = self.dag.get_step(step)[\n constants.ROUTING_BATCH_FUNCTION_ATTR_NAME\n ]\n if type_info := routing_batch_function._get_type_info():\n step += f\"-{type_info}\"\n routing_batch_functions_info.append(step)\n\n return hashlib.sha1(\n \",\".join(\n steps_names + connections_info + routing_batch_functions_info\n ).encode()\n ).hexdigest()\n\n def run(\n self,\n parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n load_groups: Optional[\"LoadGroups\"] = None,\n use_cache: bool = True,\n storage_parameters: Optional[Dict[str, Any]] = None,\n use_fs_to_pass_data: bool = False,\n dataset: Optional[\"InputDataset\"] = None,\n dataset_batch_size: int = 50,\n logging_handlers: Optional[List[logging.Handler]] = None,\n ) -> \"Distiset\": # type: ignore\n \"\"\"Run the pipeline. It will set the runtime parameters for the steps and validate\n the pipeline.\n\n This method should be extended by the specific pipeline implementation,\n adding the logic to run the pipeline.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n load_groups: A list containing lists of steps that have to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n This argument also allows passing the following modes:\n\n - \"sequential_step_execution\": each step will be executed in a stage i.e.\n the execution of the steps will be sequential.\n\n Defaults to `None`.\n use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n `True`.\n storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n that will be used to store the data of the `_Batch`es passed between the\n steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n `GlobalStep` it will be always used). It must have at least the \"path\" key,\n and it can contain additional keys depending on the protocol. By default,\n it will use the local file system and a directory in the cache directory.\n Defaults to `None`.\n use_fs_to_pass_data: Whether to use the file system to pass the data of\n the `_Batch`es between the steps. Even if this parameter is `False`, the\n `Batch`es received by `GlobalStep`s will always use the file system to\n pass the data. Defaults to `False`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n dataset_batch_size: if `dataset` is given, this will be the size of the batches\n yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n logging_handlers: A list of logging handlers that will be used to log the\n output of the pipeline. This argument can be useful so the logging messages\n can be extracted and used in a different context. 
Defaults to `None`.\n\n Returns:\n The `Distiset` created by the pipeline.\n \"\"\"\n\n self._exception: Union[Exception, None] = None\n\n # Set the runtime parameters that will be used during the pipeline execution.\n # They are used to generate the signature of the pipeline that is used to hit the\n # cache when the pipeline is run, so it's important to do it first.\n self._set_runtime_parameters(parameters or {})\n\n self._refresh_pipeline_from_cache()\n\n if dataset is not None:\n self._add_dataset_generator_step(dataset, dataset_batch_size)\n\n setup_logging(\n log_queue=self._log_queue,\n filename=str(self._cache_location[\"log_file\"]),\n logging_handlers=logging_handlers,\n )\n\n # Set the name of the pipeline if it's the default one. This should be called\n # if the pipeline is defined within the context manager, and the run is called\n # outside of it. Is here in the following case:\n # with Pipeline() as pipeline:\n # pipeline.run()\n self._set_pipeline_name()\n\n # Validate the pipeline DAG to check that all the steps are chainable, there are\n # no missing runtime parameters, batch sizes are correct, load groups are valid,\n # etc.\n self._load_groups = self._built_load_groups(load_groups)\n self._validate()\n\n self._set_pipeline_artifacts_path_in_steps()\n\n # Set the initial load status for all the steps\n self._init_steps_load_status()\n\n # Load the stages status or initialize it\n self._load_stages_status(use_cache)\n\n # Load the `_BatchManager` from cache or create one from scratch\n self._load_batch_manager(use_cache)\n\n # Check pipeline requirements are installed\n self._check_requirements()\n\n # Setup the filesystem that will be used to pass the data of the `_Batch`es\n self._setup_fsspec(storage_parameters)\n self._use_fs_to_pass_data = use_fs_to_pass_data\n\n if self._dry_run:\n self._logger.info(\"\ud83c\udf35 Dry run mode\")\n\n # If the batch manager is not able to generate batches, that means that the loaded\n # `_BatchManager` from cache didn't have any remaining batches to process i.e.\n # the previous pipeline execution was completed successfully.\n if not self._batch_manager.can_generate(): # type: ignore\n self._logger.info(\n \"\ud83d\udcbe Loaded batch manager from cache doesn't contain any remaining data.\"\n \" Returning `Distiset` from cache data...\"\n )\n distiset = create_distiset(\n data_dir=self._cache_location[\"data\"],\n pipeline_path=self._cache_location[\"pipeline\"],\n log_filename_path=self._cache_location[\"log_file\"],\n enable_metadata=self._enable_metadata,\n dag=self.dag,\n )\n stop_logging()\n return distiset\n\n self._setup_write_buffer(use_cache)\n\n self._print_load_stages_info()\n\n def dry_run(\n self,\n parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n batch_size: int = 1,\n dataset: Optional[\"InputDataset\"] = None,\n ) -> \"Distiset\":\n \"\"\"Do a dry run to test the pipeline runs as expected.\n\n Running a `Pipeline` in dry run mode will set all the `batch_size` of generator steps\n to the specified `batch_size`, and run just with a single batch, effectively\n running the whole pipeline with a single example. The cache will be set to `False`.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n batch_size: The batch size of the unique batch generated by the generators\n steps of the pipeline. Defaults to `1`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. 
Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n\n Returns:\n Will return the `Distiset` as the main run method would do.\n \"\"\"\n self._dry_run = True\n\n for step_name in self.dag:\n step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n\n if step.is_generator:\n if not parameters:\n parameters = {}\n parameters[step_name] = {\"batch_size\": batch_size}\n\n distiset = self.run(parameters=parameters, use_cache=False, dataset=dataset)\n\n self._dry_run = False\n return distiset\n\n def get_load_stages(self, load_groups: Optional[\"LoadGroups\"] = None) -> LoadStages:\n \"\"\"Convenient method to get the load stages of a pipeline.\n\n Args:\n load_groups: A list containing list of steps that has to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n Defaults to `None`.\n\n Returns:\n A tuple with the first element containing asorted list by stage containing\n lists with the names of the steps of the stage, and the second element a list\n sorted by stage containing lists with the names of the last steps of the stage.\n \"\"\"\n load_groups = self._built_load_groups(load_groups)\n return self.dag.get_steps_load_stages(load_groups)\n\n def _add_dataset_generator_step(\n self, dataset: \"InputDataset\", batch_size: int = 50\n ) -> None:\n \"\"\"Create a root step to work as the `GeneratorStep` for the pipeline using a\n dataset.\n\n Args:\n dataset: A dataset that will be used to create a `GeneratorStep` and\n placed in the DAG as the root step.\n batch_size: The size of the batches generated by the `GeneratorStep`.\n\n Raises:\n ValueError: If there's already a `GeneratorStep` in the pipeline.\n \"\"\"\n for step_name in self.dag:\n step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n if isinstance(step_name, GeneratorStep):\n raise DistilabelUserError(\n \"There is already a `GeneratorStep` in the pipeline, you can either\"\n \" pass a `dataset` to the run method, or create a `GeneratorStep` explictly.\"\n f\" `GeneratorStep`: {step}\",\n page=\"sections/how_to_guides/basic/step/#types-of-steps\",\n )\n loader = make_generator_step(\n dataset=dataset,\n pipeline=self,\n batch_size=batch_size,\n )\n self.dag.add_root_step(loader)\n\n def get_runtime_parameters_info(self) -> \"PipelineRuntimeParametersInfo\":\n \"\"\"Get the runtime parameters for the steps in the pipeline.\n\n Returns:\n A dictionary with the step name as the key and a list of dictionaries with\n the parameter name and the parameter info as the value.\n \"\"\"\n runtime_parameters = {}\n for step_name in self.dag:\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n runtime_parameters[step_name] = step.get_runtime_parameters_info()\n return runtime_parameters\n\n def _built_load_groups(\n self, load_groups: Optional[\"LoadGroups\"] = None\n ) -> List[List[str]]:\n if load_groups is None:\n return []\n\n if load_groups == \"sequential_step_execution\":\n return [[step_name] for step_name in self.dag]\n\n return [\n [\n step.name if isinstance(step, _Step) else step\n for step in steps_load_group\n ] # type: ignore\n for steps_load_group in load_groups\n ]\n\n def _validate(self) -> None:\n \"\"\"Validates the pipeline DAG to check that all the steps are chainable, there are\n no missing runtime parameters, batch sizes are correct and that load groups are\n valid (if any).\"\"\"\n self.dag.validate()\n 
self._validate_load_groups(self._load_groups)\n\n def _validate_load_groups(self, load_groups: List[List[Any]]) -> None: # noqa: C901\n \"\"\"Checks that the provided load groups are valid and that the steps can be scheduled\n to be loaded in different stages without any issue.\n\n Args:\n load_groups: the load groups to be checked.\n\n Raises:\n DistilabelUserError: if something is not OK when checking the load groups.\n \"\"\"\n\n def check_predecessor_in_load_group(\n step_name: str, load_group: List[str], first: bool\n ) -> Union[str, None]:\n if not first and step_name in load_group:\n return step_name\n\n for predecessor_step_name in self.dag.get_step_predecessors(step_name):\n # Immediate predecessor is in the same load group. This is OK.\n if first and predecessor_step_name in load_group:\n continue\n\n # Case: A -> B -> C, load_group=[A, C]\n # If a non-immediate predecessor is in the same load group and an immediate\n # predecessor is not , then it's not OK because we cannot load `step_name`\n # before one immediate predecessor.\n if step_name_in_load_group := check_predecessor_in_load_group(\n predecessor_step_name, load_group, False\n ):\n return step_name_in_load_group\n\n return None\n\n steps_included_in_load_group = []\n for load_group_num, steps_load_group in enumerate(load_groups):\n for step_name in steps_load_group:\n if step_name not in self.dag.G:\n raise DistilabelUserError(\n f\"Step with name '{step_name}' included in group {load_group_num} of\"\n \" the `load_groups` is not an step included in the pipeline. Please,\"\n \" check that you're passing the correct step name and run again.\",\n page=\"sections/how_to_guides/advanced/load_groups_and_execution_stages\",\n )\n\n node = self.dag.get_step(step_name)\n step: \"_Step\" = node[constants.STEP_ATTR_NAME]\n\n if step_name_in_load_group := check_predecessor_in_load_group(\n step_name, steps_load_group, True\n ):\n # Improve this user error message\n raise DistilabelUserError(\n f\"Step with name '{step_name}' cannot be in the same load group\"\n f\" as the step with name '{step_name_in_load_group}'. '{step_name_in_load_group}'\"\n f\" is not an immediate predecessor of '{step_name}' and there are\"\n \" immediate predecessors that have not been included.\",\n page=\"sections/how_to_guides/advanced/load_groups_and_execution_stages\",\n )\n\n if step.is_global and len(steps_load_group) > 1:\n raise DistilabelUserError(\n f\"Global step '{step_name}' has been included in a load group along\"\n \" more steps. Global steps cannot be included in a load group with\"\n \" more steps as they will be loaded in a different stage to the\"\n \" rest of the steps in the pipeline by default.\",\n page=\"sections/how_to_guides/advanced/load_groups_and_execution_stages\",\n )\n\n if step_name in steps_included_in_load_group:\n raise DistilabelUserError(\n f\"Step with name '{step_name}' in load group {load_group_num} has\"\n \" already been included in a previous load group. 
A step cannot be in more\"\n \" than one load group.\",\n page=\"sections/how_to_guides/advanced/load_groups_and_execution_stages\",\n )\n\n steps_included_in_load_group.append(step_name)\n\n def _init_steps_load_status(self) -> None:\n \"\"\"Initialize the `_steps_load_status` dictionary assigning 0 to every step of\n the pipeline.\"\"\"\n for step_name in self.dag:\n self._steps_load_status[step_name] = _STEP_NOT_LOADED_CODE\n\n def _set_pipeline_artifacts_path_in_steps(self) -> None:\n \"\"\"Sets the attribute `_pipeline_artifacts_path` in all the `Step`s of the pipeline,\n so steps can use it to get the path to save the generated artifacts.\"\"\"\n artifacts_path = self._cache_location[\"data\"] / constants.STEPS_ARTIFACTS_PATH\n for name in self.dag:\n step: \"_Step\" = self.dag.get_step(name)[constants.STEP_ATTR_NAME]\n step.set_pipeline_artifacts_path(path=artifacts_path)\n\n def _check_requirements(self) -> None:\n \"\"\"Checks if the dependencies required to run the pipeline are installed.\n\n Raises:\n ModuleNotFoundError: if one or more requirements are missing.\n \"\"\"\n if to_install := self.requirements_to_install():\n # Print the list of requirements like they would appear in a requirements.txt\n to_install_list = \"\\n\" + \"\\n\".join(to_install)\n msg = f\"Please install the following requirements to run the pipeline: {to_install_list}\"\n self._logger.error(msg)\n raise ModuleNotFoundError(msg)\n\n def _setup_fsspec(\n self, storage_parameters: Optional[Dict[str, Any]] = None\n ) -> None:\n \"\"\"Setups the `fsspec` filesystem to be used to store the data of the `_Batch`es\n passed between the steps.\n\n Args:\n storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n that will be used to store the data of the `_Batch`es passed between the\n steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n `GlobalStep` it will be always used). It must have at least the \"path\" key,\n and it can contain additional keys depending on the protocol. By default,\n it will use the local file system and a directory in the cache directory.\n Defaults to `None`.\n \"\"\"\n if not storage_parameters:\n self._fs = fsspec.filesystem(\"file\")\n self._storage_base_path = (\n f\"file://{self._cache_location['batch_input_data']}\"\n )\n return\n\n if \"path\" not in storage_parameters:\n raise DistilabelUserError(\n \"The 'path' key must be present in the `storage_parameters` dictionary\"\n \" if it's not `None`.\",\n page=\"sections/how_to_guides/advanced/fs_to_pass_data/\",\n )\n\n path = storage_parameters.pop(\"path\")\n protocol = UPath(path).protocol\n\n self._fs = fsspec.filesystem(protocol, **storage_parameters)\n self._storage_base_path = path\n\n def _add_step(self, step: \"_Step\") -> None:\n \"\"\"Add a step to the pipeline.\n\n Args:\n step: The step to be added to the pipeline.\n \"\"\"\n self.dag.add_step(step)\n\n def _add_edge(self, from_step: str, to_step: str) -> None:\n \"\"\"Add an edge between two steps in the pipeline.\n\n Args:\n from_step: The name of the step that will generate the input for `to_step`.\n to_step: The name of the step that will receive the input from `from_step`.\n \"\"\"\n self.dag.add_edge(from_step, to_step)\n\n # Check if `from_step` has a `routing_batch_function`. 
If it does, then mark\n # `to_step` as a step that will receive a routed batch.\n node = self.dag.get_step(from_step) # type: ignore\n routing_batch_function = node.get(\n constants.ROUTING_BATCH_FUNCTION_ATTR_NAME, None\n )\n self.dag.set_step_attr(\n name=to_step,\n attr=constants.RECEIVES_ROUTED_BATCHES_ATTR_NAME,\n value=routing_batch_function is not None,\n )\n\n def _is_convergence_step(self, step_name: str) -> None:\n \"\"\"Checks if a step is a convergence step.\n\n Args:\n step_name: The name of the step.\n \"\"\"\n return self.dag.get_step(step_name).get(constants.CONVERGENCE_STEP_ATTR_NAME)\n\n def _add_routing_batch_function(\n self, step_name: str, routing_batch_function: \"RoutingBatchFunction\"\n ) -> None:\n \"\"\"Add a routing batch function to a step.\n\n Args:\n step_name: The name of the step that will receive the routed batch.\n routing_batch_function: The function that will route the batch to the step.\n \"\"\"\n self.dag.set_step_attr(\n name=step_name,\n attr=constants.ROUTING_BATCH_FUNCTION_ATTR_NAME,\n value=routing_batch_function,\n )\n\n def _set_runtime_parameters(self, parameters: Dict[str, Dict[str, Any]]) -> None:\n \"\"\"Set the runtime parameters for the steps in the pipeline.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the parameter name as the key and the parameter value as the value.\n \"\"\"\n step_names = set(self.dag.G)\n for step_name, step_parameters in parameters.items():\n if step_name not in step_names:\n self._logger.warning(\n f\"\u2753 Step '{step_name}' provided in `Pipeline.run(parameters={{...}})` not found in the pipeline.\"\n f\" Available steps are: {step_names}.\"\n )\n else:\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n step.set_runtime_parameters(step_parameters)\n\n def _model_dump(self, obj: Any, **kwargs: Any) -> Dict[str, Any]:\n \"\"\"Dumps the DAG content to a dict.\n\n Args:\n obj (Any): Unused, just kept to match the signature of the parent method.\n kwargs (Any): Unused, just kept to match the signature of the parent method.\n\n Returns:\n Dict[str, Any]: Internal representation of the DAG from networkx in a serializable format.\n \"\"\"\n return self.dag.dump()\n\n def draw(\n self,\n path: Optional[Union[str, Path]] = \"pipeline.png\",\n top_to_bottom: bool = False,\n show_edge_labels: bool = True,\n ) -> str:\n \"\"\"\n Draws the pipeline.\n\n Parameters:\n path: The path to save the image to.\n top_to_bottom: Whether to draw the DAG top to bottom. Defaults to `False`.\n show_edge_labels: Whether to show the edge labels. 
Defaults to `True`.\n\n Returns:\n The path to the saved image.\n \"\"\"\n png = self.dag.draw(\n top_to_bottom=top_to_bottom, show_edge_labels=show_edge_labels\n )\n with open(path, \"wb\") as f:\n f.write(png)\n return path\n\n def __repr__(self) -> str:\n \"\"\"\n If running in a Jupyter notebook, display an image representing this `Pipeline`.\n \"\"\"\n if in_notebook():\n try:\n from IPython.display import Image, display\n\n image_data = self.dag.draw()\n\n display(Image(image_data))\n except Exception:\n pass\n return super().__repr__()\n\n def dump(self, **kwargs: Any) -> Dict[str, Any]:\n return {\n \"distilabel\": {\"version\": __version__},\n \"pipeline\": {\n \"name\": self.name,\n \"description\": self.description,\n **super().dump(),\n },\n \"requirements\": self.requirements,\n }\n\n @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> Self:\n \"\"\"Create a Pipeline from a dict containing the serialized data.\n\n Note:\n It's intended for internal use.\n\n Args:\n data (Dict[str, Any]): Dictionary containing the serialized data from a Pipeline.\n\n Returns:\n BasePipeline: Pipeline recreated from the dictionary info.\n \"\"\"\n name = data[\"pipeline\"][\"name\"]\n description = data[\"pipeline\"].get(\"description\")\n requirements = data.get(\"requirements\", [])\n with cls(name=name, description=description, requirements=requirements) as pipe:\n pipe.dag = DAG.from_dict(data[\"pipeline\"])\n return pipe\n\n @property\n def _cache_location(self) -> \"_CacheLocation\":\n \"\"\"Dictionary containing the object that will stored and the location,\n whether it is a filename or a folder.\n\n Returns:\n Path: Filenames where the pipeline content will be serialized.\n \"\"\"\n folder = self._cache_dir / self.name / self.signature\n pipeline_execution_dir = folder / \"executions\" / self.aggregated_steps_signature\n return {\n \"pipeline\": pipeline_execution_dir / \"pipeline.yaml\",\n \"batch_manager\": pipeline_execution_dir / \"batch_manager.json\",\n \"steps_data\": self._cache_dir / self.name / \"steps_data\",\n \"data\": pipeline_execution_dir / \"data\",\n \"batch_input_data\": pipeline_execution_dir / \"batch_input_data\",\n \"log_file\": pipeline_execution_dir / \"pipeline.log\",\n \"stages_file\": pipeline_execution_dir / \"stages.json\",\n }\n\n @property\n def aggregated_steps_signature(self) -> str:\n \"\"\"Creates an aggregated signature using `Step`s signature that will be used for\n the `_BatchManager`.\n\n Returns:\n The aggregated signature.\n \"\"\"\n signatures = []\n for step_name in self.dag:\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n signatures.append(step.signature)\n return hashlib.sha1(\"\".join(signatures).encode()).hexdigest()\n\n def _cache(self) -> None:\n \"\"\"Saves the `BasePipeline` using the `_cache_filename`.\"\"\"\n if self._dry_run:\n return\n\n self.save(\n path=self._cache_location[\"pipeline\"],\n format=self._cache_location[\"pipeline\"].suffix.replace(\".\", \"\"), # type: ignore\n )\n\n if self._batch_manager is not None:\n self._batch_manager.cache(\n path=self._cache_location[\"batch_manager\"],\n steps_data_path=self._cache_location[\"steps_data\"],\n )\n\n self._save_stages_status()\n\n self._logger.debug(\"Pipeline and batch manager saved to cache.\")\n\n def _save_stages_status(self) -> None:\n \"\"\"Saves the stages status to cache.\"\"\"\n self.save(\n path=self._cache_location[\"stages_file\"],\n format=\"json\",\n dump={\n \"current_stage\": self._current_stage,\n \"stages_last_batch\": 
self._stages_last_batch,\n },\n )\n\n def _get_steps_load_stages(self) -> Tuple[List[List[str]], List[List[str]]]:\n return self.dag.get_steps_load_stages(self._load_groups)\n\n def _load_stages_status(self, use_cache: bool = True) -> None:\n \"\"\"Try to load the stages status from cache, or initialize it if cache file doesn't\n exist or cache is not going to be used.\"\"\"\n if use_cache and self._cache_location[\"stages_file\"].exists():\n stages_status = read_json(self._cache_location[\"stages_file\"])\n self._current_stage = stages_status[\"current_stage\"]\n self._stages_last_batch = stages_status[\"stages_last_batch\"]\n else:\n self._current_stage = 0\n self._stages_last_batch = [\n [] for _ in range(len(self._get_steps_load_stages()[0]))\n ]\n\n def _refresh_pipeline_from_cache(self) -> None:\n \"\"\"Refresh the DAG (and its steps) from the cache file. This is useful as some\n `Step`s can update and change their state during the pipeline execution, and this\n method will make sure the pipeline is up-to-date with the latest changes when\n the pipeline is reloaded from cache.\n \"\"\"\n\n def recursively_handle_secrets_and_excluded_attributes(\n cached_model: \"BaseModel\", model: \"BaseModel\"\n ) -> None:\n \"\"\"Recursively handle the secrets and excluded attributes of a `BaseModel`,\n setting the values of the cached model to the values of the model.\n\n Args:\n cached_model: The cached model that will be updated as it doesn't contain\n the secrets and excluded attributes (not serialized).\n model: The model that contains the secrets and excluded attributes because\n it comes from pipeline instantiation.\n \"\"\"\n for field_name, field_info in cached_model.model_fields.items():\n if field_name in (\"pipeline\"):\n continue\n\n inner_type = extract_annotation_inner_type(field_info.annotation)\n if is_type_pydantic_secret_field(inner_type) or field_info.exclude:\n setattr(cached_model, field_name, getattr(model, field_name))\n elif isclass(inner_type) and issubclass(inner_type, BaseModel):\n recursively_handle_secrets_and_excluded_attributes(\n getattr(cached_model, field_name),\n getattr(model, field_name),\n )\n\n if self._cache_location[\"pipeline\"].exists():\n cached_dag = self.from_yaml(self._cache_location[\"pipeline\"]).dag\n\n for step_name in cached_dag:\n step_cached: \"_Step\" = cached_dag.get_step(step_name)[\n constants.STEP_ATTR_NAME\n ]\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n recursively_handle_secrets_and_excluded_attributes(step_cached, step)\n\n self.dag = cached_dag\n\n def _load_batch_manager(self, use_cache: bool = True) -> None:\n \"\"\"Will try to load the `_BatchManager` from the cache dir if found. 
Otherwise,\n it will create one from scratch.\n\n If the `_BatchManager` is loaded from cache, we check for invalid steps (those that\n may have a different signature than the original in the pipeline folder), and\n restart them, as well as their successors.\n\n Args:\n use_cache: whether the cache should be used or not.\n \"\"\"\n batch_manager_cache_loc = self._cache_location[\"batch_manager\"]\n\n # This first condition handles the case in which the pipeline is exactly the same\n # no steps have been added, removed or changed.\n if use_cache and batch_manager_cache_loc.exists():\n self._logger.info(\n f\"\ud83d\udcbe Loading `_BatchManager` from cache: '{batch_manager_cache_loc}'\"\n )\n self._batch_manager = _BatchManager.load_from_cache(\n dag=self.dag,\n batch_manager_path=batch_manager_cache_loc,\n steps_data_path=self._cache_location[\"steps_data\"],\n )\n self._invalidate_steps_cache_if_required()\n # In this other case, the pipeline has been changed. We need to create a new batch\n # manager and if `use_cache==True` then check which outputs have we computed and\n # cached for steps that haven't changed but that were executed in another pipeline,\n # and therefore we can reuse\n else:\n self._batch_manager = _BatchManager.from_dag(\n dag=self.dag,\n use_cache=use_cache,\n steps_data_path=self._cache_location[\"steps_data\"],\n )\n\n def _invalidate_steps_cache_if_required(self) -> None:\n \"\"\"Iterates over the steps of the pipeline and invalidates their cache if required.\"\"\"\n for step_name in self.dag:\n # `GeneratorStep`s doesn't receive input data so no need to check their\n # `_BatchManagerStep`\n if self.dag.get_step(step_name)[constants.STEP_ATTR_NAME].is_generator:\n continue\n\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n if not step.use_cache:\n self._batch_manager.invalidate_cache_for(\n step_name=step.name,\n dag=self.dag,\n steps_data_path=self._cache_location[\"steps_data\"],\n ) # type: ignore\n self._logger.info(\n f\"\u267b\ufe0f Step '{step.name}' won't use cache (`use_cache=False`). 
The cache of this step and their successors won't be\"\n \" reused and the results will have to be recomputed.\"\n )\n break\n\n def _setup_write_buffer(self, use_cache: bool = True) -> None:\n \"\"\"Setups the `_WriteBuffer` that will store the data of the leaf steps of the\n pipeline while running, so the `Distiset` can be created at the end.\n \"\"\"\n if not use_cache and self._cache_location[\"data\"].exists():\n shutil.rmtree(self._cache_location[\"data\"])\n buffer_data_path = self._cache_location[\"data\"] / constants.STEPS_OUTPUTS_PATH\n self._logger.info(f\"\ud83d\udcdd Pipeline data will be written to '{buffer_data_path}'\")\n self._write_buffer = _WriteBuffer(\n buffer_data_path,\n self.dag.leaf_steps,\n steps_cached={\n step_name: self.dag.get_step(step_name)[\n constants.STEP_ATTR_NAME\n ].use_cache\n for step_name in self.dag\n },\n )\n\n def _print_load_stages_info(self) -> None:\n \"\"\"Prints the information about the load stages.\"\"\"\n stages, _ = self._get_steps_load_stages()\n msg = \"\"\n for stage, steps in enumerate(stages):\n steps_to_be_loaded = self._steps_to_be_loaded_in_stage(stage)\n msg += f\"\\n * Stage {stage}:\"\n for step_name in steps:\n step: \"Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n if step.is_generator:\n emoji = \"\ud83d\udeb0\"\n elif step.is_global:\n emoji = \"\ud83c\udf10\"\n else:\n emoji = \"\ud83d\udd04\"\n msg += f\"\\n - {emoji} '{step_name}'\"\n if step_name not in steps_to_be_loaded:\n msg += \" (results cached, won't be loaded and executed)\"\n legend = \"\\n * Legend: \ud83d\udeb0 GeneratorStep \ud83c\udf10 GlobalStep \ud83d\udd04 Step\"\n self._logger.info(\n f\"\u231b The steps of the pipeline will be loaded in stages:{legend}{msg}\"\n )\n\n def _run_output_queue_loop_in_thread(self) -> threading.Thread:\n \"\"\"Runs the output queue loop in a separate thread to receive the output batches\n from the steps. This is done to avoid the signal handler to block the loop, which\n would prevent the pipeline from stopping correctly.\"\"\"\n thread = threading.Thread(target=self._output_queue_loop)\n thread.start()\n return thread\n\n def _output_queue_loop(self) -> None:\n \"\"\"Loop to receive the output batches from the steps and manage the flow of the\n batches through the pipeline.\"\"\"\n self._create_steps_input_queues()\n\n if not self._initialize_pipeline_execution():\n return\n\n while self._should_continue_processing(): # type: ignore\n self._logger.debug(\"Waiting for output batch from step...\")\n if (batch := self._output_queue.get()) is None:\n self._logger.debug(\"Received `None` from output queue. 
Breaking loop.\")\n break\n\n self._logger.debug(\n f\"Received batch with seq_no {batch.seq_no} from step '{batch.step_name}'\"\n f\" from output queue: {batch}\"\n )\n\n self._process_batch(batch)\n\n # If `_stop_called` was set to `True` while waiting for the output queue, then\n # we need to handle the stop of the pipeline and break the loop to avoid\n # propagating the batches through the pipeline and making the stop process\n # slower.\n with self._stop_called_lock:\n if self._stop_called:\n self._handle_batch_on_stop(batch)\n break\n\n # If there is another load stage and all the `last_batch`es from the stage\n # have been received, then load the next stage.\n if self._should_load_next_stage():\n self._wait_current_stage_to_finish()\n if not self._update_stage():\n break\n\n self._manage_batch_flow(batch)\n\n self._finalize_pipeline_execution()\n\n def _create_steps_input_queues(self) -> None:\n \"\"\"Creates the input queue for all the steps in the pipeline.\"\"\"\n for step_name in self.dag:\n self._logger.debug(f\"Creating input queue for '{step_name}' step...\")\n input_queue = self._create_step_input_queue(step_name)\n self._steps_input_queues[step_name] = input_queue\n\n def _initialize_pipeline_execution(self) -> bool:\n \"\"\"Load the steps of the required stage to initialize the pipeline execution,\n and requests the initial batches to trigger the batch flowing in the pipeline.\n\n Returns:\n `True` if initialization went OK, `False` otherwise.\n \"\"\"\n # Wait for all the steps to be loaded correctly\n if not self._run_stage_steps_and_wait(stage=self._current_stage):\n self._set_steps_not_loaded_exception()\n return False\n\n # Send the \"first\" batches to the steps so the batches starts flowing through\n # the input queues and output queue\n self._request_initial_batches()\n\n return True\n\n def _should_continue_processing(self) -> bool:\n \"\"\"Condition for the consume batches from the `output_queue` loop.\n\n Returns:\n `True` if should continue consuming batches, `False` otherwise and the pipeline\n should stop.\n \"\"\"\n with self._stop_called_lock:\n return self._batch_manager.can_generate() and not self._stop_called # type: ignore\n\n def _process_batch(\n self, batch: \"_Batch\", send_last_batch_flag: bool = True\n ) -> None:\n \"\"\"Process a batch consumed from the `output_queue`.\n\n Args:\n batch: the batch to be processed.\n \"\"\"\n if batch.data_path:\n self._logger.debug(\n f\"Reading {batch.seq_no} batch data from '{batch.step_name}': '{batch.data_path}'\"\n )\n batch.read_batch_data_from_fs()\n\n if batch.step_name in self.dag.leaf_steps:\n self._write_buffer.add_batch(batch) # type: ignore\n\n if batch.last_batch:\n self._register_stages_last_batch(batch)\n\n # Make sure to send the `LAST_BATCH_SENT_FLAG` to the predecessors of the step\n # if the batch is the last one, so they stop their processing loop even if they\n # haven't received the last batch because of the routing function.\n if send_last_batch_flag:\n for step_name in self.dag.get_step_predecessors(batch.step_name):\n if self._is_step_running(step_name):\n self._send_last_batch_flag_to_step(step_name)\n\n def _set_step_for_recovering_offline_batch_generation(\n self, step: \"_Step\", data: List[List[Dict[str, Any]]]\n ) -> None:\n \"\"\"Sets the required information to recover a pipeline execution from a `_Step`\n that used an `LLM` with offline batch generation.\n\n Args:\n step: The `_Step` that used an `LLM` with offline batch generation.\n data: The data that was used to generate the 
batches for the step.\n \"\"\"\n # Replace step so the attribute `jobs_ids` of the `LLM` is not lost, as it was\n # updated in the child process but not in the main process.\n step_name: str = step.name # type: ignore\n self.dag.set_step_attr(\n name=step_name, attr=constants.STEP_ATTR_NAME, value=step\n )\n self._recover_offline_batch_generate_for_step = (step_name, data)\n\n def _add_batch_for_recovering_offline_batch_generation(self) -> None:\n \"\"\"Adds a dummy `_Batch` to the specified step name (it's a `Task` that used an\n `LLM` with offline batch generation) to recover the pipeline state for offline\n batch generation in next pipeline executions.\"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n if self._recover_offline_batch_generate_for_step is None:\n return\n\n step_name, data = self._recover_offline_batch_generate_for_step\n self._logger.debug(\n f\"Adding batch to '{step_name}' step to recover pipeline execution for offline\"\n \" batch generation...\"\n )\n self._batch_manager.add_batch_to_recover_offline_batch_generation(\n to_step=step_name,\n data=data,\n )\n\n def _register_stages_last_batch(self, batch: \"_Batch\") -> None:\n \"\"\"Registers the last batch received from a step in the `_stages_last_batch`\n dictionary.\n\n Args:\n batch: The last batch received from a step.\n \"\"\"\n _, stages_last_steps = self._get_steps_load_stages()\n stage_last_steps = stages_last_steps[self._current_stage]\n if batch.step_name in stage_last_steps:\n self._stages_last_batch[self._current_stage].append(batch.step_name)\n self._stages_last_batch[self._current_stage].sort()\n\n def _update_stage(self) -> bool:\n \"\"\"Checks if the steps of next stage should be loaded and updates `_current_stage`\n attribute.\n\n Returns:\n `True` if updating the stage went OK, `False` otherwise.\n \"\"\"\n self._current_stage += 1\n if not self._run_stage_steps_and_wait(stage=self._current_stage):\n self._set_steps_not_loaded_exception()\n return False\n\n return True\n\n def _should_load_next_stage(self) -> bool:\n \"\"\"Returns if the next stage should be loaded.\n\n Returns:\n `True` if the next stage should be loaded, `False` otherwise.\n \"\"\"\n _, stage_last_steps = self._get_steps_load_stages()\n there_is_next_stage = self._current_stage + 1 < len(stage_last_steps)\n stage_last_batches_received = (\n self._stages_last_batch[self._current_stage]\n == stage_last_steps[self._current_stage]\n )\n return there_is_next_stage and stage_last_batches_received\n\n def _finalize_pipeline_execution(self) -> None:\n \"\"\"Finalizes the pipeline execution handling the prematurely stop of the pipeline\n if required, caching the data and ensuring that all the steps finish its execution.\"\"\"\n\n # Send `None` to steps `input_queue`s just in case some step is still waiting\n self._notify_steps_to_stop()\n\n for step_name in self.dag:\n while self._is_step_running(step_name):\n self._logger.debug(f\"Waiting for step '{step_name}' to finish...\")\n time.sleep(0.5)\n\n with self._stop_called_lock:\n if self._stop_called:\n self._handle_stop()\n\n # Reset flag state\n self._stop_called = False\n\n self._add_batch_for_recovering_offline_batch_generation()\n\n self._cache()\n\n def _run_load_queue_loop_in_thread(self) -> threading.Thread:\n \"\"\"Runs a background thread that reads from the `load_queue` to update the status\n of the number of replicas loaded for each step.\n\n Returns:\n The thread that was started.\n \"\"\"\n thread = threading.Thread(target=self._run_load_queue_loop)\n 
thread.start()\n return thread\n\n def _run_load_queue_loop(self) -> None:\n \"\"\"Runs a loop that reads from the `load_queue` to update the status of the number\n of replicas loaded for each step.\"\"\"\n\n while True:\n if (load_info := self._load_queue.get()) is None:\n self._logger.debug(\"Received `None` from load queue. Breaking loop.\")\n break\n\n with self._steps_load_status_lock:\n step_name, status = load_info[\"name\"], load_info[\"status\"]\n if status == \"loaded\":\n if self._steps_load_status[step_name] == _STEP_NOT_LOADED_CODE:\n self._steps_load_status[step_name] = 1\n else:\n self._steps_load_status[step_name] += 1\n elif status == \"unloaded\":\n self._steps_load_status[step_name] -= 1\n if self._steps_load_status[step_name] == 0:\n self._steps_load_status[step_name] = _STEP_UNLOADED_CODE\n else:\n # load failed\n self._steps_load_status[step_name] = _STEP_LOAD_FAILED_CODE\n\n self._logger.debug(\n f\"Step '{step_name}' loaded replicas: {self._steps_load_status[step_name]}\"\n )\n\n def _is_step_running(self, step_name: str) -> bool:\n \"\"\"Checks if the step is running (at least one replica is running).\n\n Args:\n step_name: The step to be check if running.\n\n Returns:\n `True` if the step is running, `False` otherwise.\n \"\"\"\n with self._steps_load_status_lock:\n return self._steps_load_status[step_name] >= 1\n\n def _steps_to_be_loaded_in_stage(self, stage: int) -> List[str]:\n \"\"\"Returns the list of steps of the provided stage that should be loaded taking\n into account if they have finished.\n\n Args:\n stage: the stage number\n\n Returns:\n A list containing the name of the steps that should be loaded in this stage.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n steps_stages, _ = self._get_steps_load_stages()\n\n return [\n step\n for step in steps_stages[stage]\n if not self._batch_manager.step_has_finished(step)\n ]\n\n def _get_steps_load_status(self, steps: List[str]) -> Dict[str, int]:\n \"\"\"Gets the a dictionary containing the load status of the provided steps.\n\n Args:\n steps: a list containing the names of the steps to get their load status.\n\n Returns:\n A dictionary containing the load status of the provided steps.\n \"\"\"\n return {\n step_name: replicas\n for step_name, replicas in self._steps_load_status.items()\n if step_name in steps\n }\n\n def _wait_current_stage_to_finish(self) -> None:\n \"\"\"Waits for the current stage to finish.\"\"\"\n stage = self._current_stage\n steps = self._steps_to_be_loaded_in_stage(stage)\n self._logger.info(f\"\u23f3 Waiting for stage {stage} to finish...\")\n with self._stop_called_lock:\n while not self._stop_called:\n filtered_steps_load_status = self._get_steps_load_status(steps)\n if all(\n replicas == _STEP_UNLOADED_CODE\n for replicas in filtered_steps_load_status.values()\n ):\n self._logger.info(f\"\u2705 Stage {stage} has finished!\")\n break\n\n def _run_stage_steps_and_wait(self, stage: int) -> bool:\n \"\"\"Runs the steps of the specified stage and waits for them to be ready.\n\n Args:\n stage: the stage from which the steps have to be loaded.\n\n Returns:\n `True` if all the steps have been loaded correctly, `False` otherwise.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n steps = self._steps_to_be_loaded_in_stage(stage)\n self._logger.debug(f\"Steps to be loaded in stage {stage}: {steps}\")\n\n # Run the steps of the stage\n self._run_steps(steps=steps)\n\n # Wait for them to be ready\n self._logger.info(f\"\u23f3 Waiting for all the 
steps of stage {stage} to load...\")\n previous_message = None\n with self._stop_called_lock:\n while not self._stop_called:\n with self._steps_load_status_lock:\n filtered_steps_load_status = self._get_steps_load_status(steps)\n self._logger.debug(\n f\"Steps from stage {stage} loaded: {filtered_steps_load_status}\"\n )\n\n if any(\n replicas_loaded == _STEP_LOAD_FAILED_CODE\n for replicas_loaded in filtered_steps_load_status.values()\n ):\n self._logger.error(\n f\"\u274c Failed to load all the steps of stage {stage}\"\n )\n return False\n\n num_steps_loaded = 0\n replicas_message = \"\"\n for step_name, replicas in filtered_steps_load_status.items():\n step_replica_count = self.dag.get_step_replica_count(step_name)\n # It can happen that the step is very fast and it has done all the\n # work and have finished its execution before checking if it has\n # been loaded, that's why we also considered the step to be loaded\n # if `_STEP_UNLOADED_CODE`.\n if (\n replicas == step_replica_count\n or replicas == _STEP_UNLOADED_CODE\n ):\n num_steps_loaded += 1\n replicas_message += f\"\\n * '{step_name}' replicas: {max(0, replicas)}/{step_replica_count}\"\n\n message = f\"\u23f3 Steps from stage {stage} loaded: {num_steps_loaded}/{len(filtered_steps_load_status)}{replicas_message}\"\n if num_steps_loaded > 0 and message != previous_message:\n self._logger.info(message)\n previous_message = message\n\n if num_steps_loaded == len(filtered_steps_load_status):\n self._logger.info(\n f\"\u2705 All the steps from stage {stage} have been loaded!\"\n )\n return True\n\n time.sleep(2.5)\n\n return not self._stop_called\n\n def _handle_stop(self) -> None:\n \"\"\"Handles the stop of the pipeline execution, which will stop the steps from\n processing more batches and wait for the output queue to be empty, to not lose\n any data that was already processed by the steps before the stop was called.\"\"\"\n self._logger.debug(\"Handling stop of the pipeline execution...\")\n\n self._add_batches_back_to_batch_manager()\n\n # Wait for the input queue to be empty, which means that all the steps finished\n # processing the batches that were sent before the stop flag.\n self._wait_steps_input_queues_empty()\n\n self._consume_output_queue()\n\n if self._should_load_next_stage():\n self._current_stage += 1\n\n def _wait_steps_input_queues_empty(self) -> None:\n self._logger.debug(\"Waiting for steps input queues to be empty...\")\n for step_name in self.dag:\n self._wait_step_input_queue_empty(step_name)\n self._logger.debug(\"Steps input queues are empty!\")\n\n def _wait_step_input_queue_empty(self, step_name: str) -> Union[\"Queue[Any]\", None]:\n \"\"\"Waits for the input queue of a step to be empty.\n\n Args:\n step_name: The name of the step.\n\n Returns:\n The input queue of the step if it's not loaded or finished, `None` otherwise.\n \"\"\"\n if self._check_step_not_loaded_or_finished(step_name):\n return None\n\n if input_queue := self.dag.get_step(step_name).get(\n constants.INPUT_QUEUE_ATTR_NAME\n ):\n while input_queue.qsize() != 0:\n pass\n return input_queue\n\n def _check_step_not_loaded_or_finished(self, step_name: str) -> bool:\n \"\"\"Checks if a step is not loaded or already finished.\n\n Args:\n step_name: The name of the step.\n\n Returns:\n `True` if the step is not loaded or already finished, `False` otherwise.\n \"\"\"\n with self._steps_load_status_lock:\n num_replicas = self._steps_load_status[step_name]\n\n # The step has finished (replicas = 0) or it has failed to load\n if num_replicas in 
[0, _STEP_LOAD_FAILED_CODE, _STEP_UNLOADED_CODE]:\n return True\n\n return False\n\n @property\n @abstractmethod\n def QueueClass(self) -> Callable:\n \"\"\"The class of the queue to use in the pipeline.\"\"\"\n pass\n\n def _create_step_input_queue(self, step_name: str) -> \"Queue[Any]\":\n \"\"\"Creates an input queue for a step.\n\n Args:\n step_name: The name of the step.\n\n Returns:\n The input queue created.\n \"\"\"\n input_queue = self.QueueClass()\n self.dag.set_step_attr(step_name, constants.INPUT_QUEUE_ATTR_NAME, input_queue)\n return input_queue\n\n @abstractmethod\n def _run_step(self, step: \"_Step\", input_queue: \"Queue[Any]\", replica: int) -> None:\n \"\"\"Runs the `Step` instance.\n\n Args:\n step: The `Step` instance to run.\n input_queue: The input queue where the step will receive the batches.\n replica: The replica ID assigned.\n \"\"\"\n pass\n\n def _run_steps(self, steps: Iterable[str]) -> None:\n \"\"\"Runs the `Step`s of the pipeline, creating first an input queue for each step\n that will be used to send the batches.\n\n Args:\n steps:\n \"\"\"\n for step_name in steps:\n step: \"Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n input_queue = self._steps_input_queues[step.name] # type: ignore\n\n # Set `pipeline` to `None` as in some Python environments the pipeline is not\n # picklable and it will raise an error when trying to send the step to the process.\n # `TypeError: cannot pickle 'code' object`\n step.pipeline = None\n\n if not step.is_normal and step.resources.replicas > 1: # type: ignore\n self._logger.warning(\n f\"Step '{step_name}' is a `GeneratorStep` or `GlobalStep` and has more\"\n \" than 1 replica. Only `Step` instances can have more than 1 replica.\"\n \" The number of replicas for the step will be set to 1.\"\n )\n\n step_num_replicas: int = step.resources.replicas if step.is_normal else 1 # type: ignore\n for replica in range(step_num_replicas):\n self._logger.debug(\n f\"Running 1 replica of step '{step.name}' with ID {replica}...\"\n )\n self._run_step(\n step=step.model_copy(deep=True),\n input_queue=input_queue,\n replica=replica,\n )\n\n def _add_batches_back_to_batch_manager(self) -> None:\n \"\"\"Add the `Batch`es that were sent to a `Step` back to the `_BatchManager`. This\n method should be used when the pipeline has been stopped prematurely.\"\"\"\n self._logger.debug(\n \"Adding batches from step input queues back to the batch manager...\"\n )\n for step_name in self.dag:\n node = self.dag.get_step(step_name)\n step: \"_Step\" = node[constants.STEP_ATTR_NAME]\n if step.is_generator:\n continue\n if input_queue := node.get(constants.INPUT_QUEUE_ATTR_NAME):\n while not input_queue.empty():\n batch = input_queue.get()\n if not isinstance(batch, _Batch):\n continue\n self._batch_manager.add_batch( # type: ignore\n to_step=step_name,\n batch=batch,\n prepend=True,\n )\n self._logger.debug(\n f\"Adding batch back to the batch manager: {batch}\"\n )\n if self._check_step_not_loaded_or_finished(step_name):\n # Notify the step to stop\n input_queue.put(None)\n self._logger.debug(\"Finished adding batches back to the batch manager.\")\n\n def _consume_output_queue(self) -> None:\n \"\"\"Consumes the `Batch`es from the output queue until it's empty. 
This method should\n be used when the pipeline has been stopped prematurely to consume and to not lose\n the `Batch`es that were processed by the leaf `Step`s before stopping the pipeline.\"\"\"\n while not self._output_queue.empty():\n batch = self._output_queue.get()\n if batch is None:\n continue\n self._process_batch(batch, send_last_batch_flag=False)\n self._handle_batch_on_stop(batch)\n\n def _manage_batch_flow(self, batch: \"_Batch\") -> None:\n \"\"\"Checks if the step that generated the batch has more data in its buffer to\n generate a new batch. If there's data, then a new batch is sent to the step. If\n the step has no data in its buffer, then the predecessors generator steps are\n requested to send a new batch.\n\n Args:\n batch: The batch that was processed.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n route_to, do_not_route_to, routed = self._get_successors(batch)\n\n self._register_batch(batch)\n\n # Keep track of the steps that the batch was routed to\n if routed:\n batch.batch_routed_to = route_to\n\n self._set_next_expected_seq_no(\n steps=do_not_route_to,\n from_step=batch.step_name,\n next_expected_seq_no=batch.seq_no + 1,\n )\n\n step = self._get_step_from_batch(batch)\n\n # Add the batch to the successors input buffers\n for successor in route_to:\n # Copy batch to avoid modifying the same reference in the batch manager\n batch_to_add = batch.copy() if len(route_to) > 1 else batch\n\n self._batch_manager.add_batch(successor, batch_to_add)\n\n # Check if the step is a generator and if there are successors that need data\n # from this step. This usually happens when the generator `batch_size` is smaller\n # than the `input_batch_size` of the successor steps.\n if (\n step.is_generator\n and step.name in self._batch_manager.step_empty_buffers(successor)\n ):\n last_batch_sent = self._batch_manager.get_last_batch_sent(step.name)\n self._send_batch_to_step(last_batch_sent.next_batch()) # type: ignore\n\n # If successor step has enough data in its buffer to create a new batch, then\n # send the batch to the step.\n while new_batch := self._batch_manager.get_batch(successor):\n self._send_batch_to_step(new_batch)\n\n if not step.is_generator:\n # Step (\"this\", the one from which the batch was received) has enough data on its\n # buffers to create a new batch\n while new_batch := self._batch_manager.get_batch(step.name): # type: ignore\n self._send_batch_to_step(new_batch)\n else:\n self._request_more_batches_if_needed(step)\n else:\n # Case in which the pipeline only contains a `GeneratorStep` so we constanly keep\n # requesting batch after batch as there is no downstream step to consume it\n if len(self.dag) == 1:\n self._request_batch_from_generator(step.name) # type: ignore\n\n self._cache()\n\n def _send_to_step(self, step_name: str, to_send: Any) -> None:\n \"\"\"Sends something to the input queue of a step.\n\n Args:\n step_name: The name of the step.\n to_send: The object to send.\n \"\"\"\n input_queue = self.dag.get_step(step_name)[constants.INPUT_QUEUE_ATTR_NAME]\n input_queue.put(to_send)\n\n def _send_batch_to_step(self, batch: \"_Batch\") -> None:\n \"\"\"Sends a batch to the input queue of a step, writing the data of the batch\n to the filesystem and setting `batch.data_path` with the path where the data\n was written (if requiered i.e. 
the step is a global step or `use_fs_to_pass_data`)\n\n This method should be extended by the specific pipeline implementation, adding\n the logic to send the batch to the step.\n\n Args:\n batch: The batch to send.\n \"\"\"\n self._logger.debug(\n f\"Setting batch {batch.seq_no} as last batch sent to '{batch.step_name}': {batch}\"\n )\n self._batch_manager.set_last_batch_sent(batch) # type: ignore\n\n step: \"_Step\" = self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n if not step.is_generator and (step.is_global or self._use_fs_to_pass_data):\n base_path = UPath(self._storage_base_path) / step.name # type: ignore\n self._logger.debug(\n f\"Writing {batch.seq_no} batch for '{batch.step_name}' step to filesystem: {base_path}\"\n )\n batch.write_batch_data_to_fs(self._fs, base_path) # type: ignore\n\n self._logger.debug(\n f\"Sending batch {batch.seq_no} to step '{batch.step_name}': {batch}\"\n )\n self._send_to_step(batch.step_name, batch)\n\n def _gather_requirements(self) -> List[str]:\n \"\"\"Extracts the requirements from the steps to be used in the pipeline.\n\n Returns:\n List of requirements gathered from the steps.\n \"\"\"\n steps_requirements = []\n for step in self.dag:\n step_req = self.dag.get_step(step)[constants.STEP_ATTR_NAME].requirements\n steps_requirements.extend(step_req)\n\n return steps_requirements\n\n def _register_batch(self, batch: \"_Batch\") -> None:\n \"\"\"Registers a batch in the batch manager.\n\n Args:\n batch: The batch to register.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n self._batch_manager.register_batch(\n batch, steps_data_path=self._cache_location[\"steps_data\"]\n ) # type: ignore\n self._logger.debug(\n f\"Batch {batch.seq_no} from step '{batch.step_name}' registered in batch\"\n \" manager\"\n )\n\n def _send_last_batch_flag_to_step(self, step_name: str) -> None:\n \"\"\"Sends the `LAST_BATCH_SENT_FLAG` to a step to stop processing batches.\n\n Args:\n step_name: The name of the step.\n \"\"\"\n self._logger.debug(\n f\"Sending `LAST_BATCH_SENT_FLAG` to '{step_name}' step to stop processing\"\n \" batches...\"\n )\n\n for _ in range(self.dag.get_step_replica_count(step_name)):\n self._send_to_step(step_name, constants.LAST_BATCH_SENT_FLAG)\n self._batch_manager.set_last_batch_flag_sent_to(step_name) # type: ignore\n\n def _request_initial_batches(self) -> None:\n \"\"\"Requests the initial batches to the generator steps.\"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n for step in self._batch_manager._steps.values():\n if not self._is_step_running(step.step_name):\n continue\n if batch := step.get_batch():\n self._logger.debug(\n f\"Sending initial batch to '{step.step_name}' step: {batch}\"\n )\n self._send_batch_to_step(batch)\n\n for step_name in self.dag.root_steps:\n if not self._is_step_running(step_name):\n continue\n seq_no = 0\n if last_batch := self._batch_manager.get_last_batch(step_name):\n seq_no = last_batch.seq_no + 1\n batch = _Batch(seq_no=seq_no, step_name=step_name, last_batch=self._dry_run)\n self._logger.debug(\n f\"Requesting initial batch to '{step_name}' generator step: {batch}\"\n )\n self._send_batch_to_step(batch)\n\n def _request_batch_from_generator(self, step_name: str) -> None:\n \"\"\"Request a new batch to a `GeneratorStep`.\n\n Args:\n step_name: the name of the `GeneratorStep` to which a batch has to be requested.\n \"\"\"\n # Get the last batch that the previous step sent to generate the next batch\n # (next `seq_no`).\n last_batch = 
self._batch_manager.get_last_batch_sent(step_name) # type: ignore\n if last_batch is None:\n return\n self._send_batch_to_step(last_batch.next_batch())\n\n def _request_more_batches_if_needed(self, step: \"Step\") -> None:\n \"\"\"Request more batches to the predecessors steps of `step` if needed.\n\n Args:\n step: The step of which it has to be checked if more batches are needed from\n its predecessors.\n \"\"\"\n empty_buffers = self._batch_manager.step_empty_buffers(step.name) # type: ignore\n for previous_step_name in empty_buffers:\n # Only more batches can be requested to the `GeneratorStep`s as they are the\n # only kind of steps that lazily generate batches.\n if previous_step_name not in self.dag.root_steps:\n continue\n\n self._request_batch_from_generator(previous_step_name)\n\n def _handle_batch_on_stop(self, batch: \"_Batch\") -> None:\n \"\"\"Handles a batch that was received from the output queue when the pipeline was\n stopped. It will add and register the batch in the batch manager.\n\n Args:\n batch: The batch to handle.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n self._batch_manager.register_batch(\n batch, steps_data_path=self._cache_location[\"steps_data\"]\n )\n step: \"Step\" = self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n for successor in self.dag.get_step_successors(step.name): # type: ignore\n self._batch_manager.add_batch(successor, batch)\n\n def _get_step_from_batch(self, batch: \"_Batch\") -> \"Step\":\n \"\"\"Gets the `Step` instance from a batch.\n\n Args:\n batch: The batch to get the step from.\n\n Returns:\n The `Step` instance.\n \"\"\"\n return self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n\n def _notify_steps_to_stop(self) -> None:\n \"\"\"Notifies the steps to stop their infinite running loop by sending `None` to\n their input queues.\"\"\"\n with self._steps_load_status_lock:\n for step_name, replicas in self._steps_load_status.items():\n if replicas > 0:\n for _ in range(replicas):\n self._send_to_step(step_name, None)\n\n def _get_successors(self, batch: \"_Batch\") -> Tuple[List[str], List[str], bool]:\n \"\"\"Gets the successors and the successors to which the batch has to be routed.\n\n Args:\n batch: The batch to which the successors will be determined.\n\n Returns:\n The successors to route the batch to and whether the batch was routed using\n a routing function.\n \"\"\"\n node = self.dag.get_step(batch.step_name)\n step: \"Step\" = node[constants.STEP_ATTR_NAME]\n successors = list(self.dag.get_step_successors(step.name)) # type: ignore\n route_to = successors\n\n # Check if the step has a routing function to send the batch to specific steps\n if routing_batch_function := node.get(\n constants.ROUTING_BATCH_FUNCTION_ATTR_NAME\n ):\n route_to = routing_batch_function(batch, successors)\n successors_str = \", \".join(f\"'{successor}'\" for successor in route_to)\n self._logger.info(\n f\"\ud83d\ude8f Using '{step.name}' routing function to send batch {batch.seq_no} to steps: {successors_str}\"\n )\n\n return route_to, list(set(successors) - set(route_to)), route_to != successors\n\n def _set_next_expected_seq_no(\n self, steps: List[str], from_step: str, next_expected_seq_no: int\n ) -> None:\n \"\"\"Sets the next expected sequence number of a `_Batch` received by `step` from\n `from_step`. 
This is necessary as some `Step`s might not receive all the batches\n comming from the previous steps because there is a routing batch function.\n\n Args:\n steps: list of steps to which the next expected sequence number of a `_Batch`\n from `from_step` has to be updated in the `_BatchManager`.\n from_step: the name of the step from which the next expected sequence number\n of a `_Batch` has to be updated in `steps`.\n next_expected_seq_no: the number of the next expected sequence number of a `Batch`\n from `from_step`.\n \"\"\"\n assert self._batch_manager, \"Batch manager is not set\"\n\n for step in steps:\n self._batch_manager.set_next_expected_seq_no(\n step_name=step,\n from_step=from_step,\n next_expected_seq_no=next_expected_seq_no,\n )\n\n @abstractmethod\n def _teardown(self) -> None:\n \"\"\"Clean/release/stop resources reserved to run the pipeline.\"\"\"\n pass\n\n @abstractmethod\n def _set_steps_not_loaded_exception(self) -> None:\n \"\"\"Used to raise `RuntimeError` when the load of the steps failed.\n\n Raises:\n RuntimeError: containing the information and why a step failed to be loaded.\n \"\"\"\n pass\n\n @abstractmethod\n def _stop(self) -> None:\n \"\"\"Stops the pipeline in a controlled way.\"\"\"\n pass\n\n def _stop_load_queue_loop(self) -> None:\n \"\"\"Stops the `_load_queue` loop sending a `None`.\"\"\"\n self._logger.debug(\"Sending `None` to the load queue to notify stop...\")\n self._load_queue.put(None)\n\n def _stop_output_queue_loop(self) -> None:\n \"\"\"Stops the `_output_queue` loop sending a `None`.\"\"\"\n self._logger.debug(\"Sending `None` to the output queue to notify stop...\")\n self._output_queue.put(None)\n\n def _handle_keyboard_interrupt(self) -> Any:\n \"\"\"Handles KeyboardInterrupt signal sent during the Pipeline.run method.\n\n It will try to call self._stop (if the pipeline didn't started yet, it won't\n have any effect), and if the pool is already started, will close it before exiting\n the program.\n\n Returns:\n The original `signal.SIGINT` handler.\n \"\"\"\n\n def signal_handler(signumber: int, frame: Any) -> None:\n self._stop()\n\n return signal.signal(signal.SIGINT, signal_handler)\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.signature","title":"signature: str property ","text":"Makes a signature (hash) of a pipeline, using the step ids and the adjacency between them. The main use is to find the pipeline in the cache folder. Returns: Type Description str Signature of the pipeline. "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.aggregated_steps_signature","title":"aggregated_steps_signature: str property ","text":"Creates an aggregated signature using Step s signature that will be used for the _BatchManager . Returns: Type Description str The aggregated signature. "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.QueueClass","title":"QueueClass: Callable abstractmethod property ","text":"The class of the queue to use in the pipeline. "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__init__","title":"__init__(name=None, description=None, cache_dir=None, enable_metadata=False, requirements=None) ","text":"Initialize the BasePipeline instance. Parameters: Name Type Description Default name Optional[str] The name of the pipeline. If not generated, a random one will be generated by default. None description Optional[str] A description of the pipeline. Defaults to None . 
None cache_dir Optional[Union[str, PathLike]] A directory where the pipeline will be cached. Defaults to None . None enable_metadata bool Whether to include the distilabel metadata column for the pipeline in the final Distiset . It contains metadata used by distilabel, for example the raw outputs of the LLM without processing would be here, inside raw_output_... field. Defaults to False . False requirements Optional[List[str]] List of requirements that must be installed to run the pipeline. Defaults to None , but can be helpful to inform in a pipeline to be shared that this requirements must be installed. None Source code in src/distilabel/pipeline/base.py def __init__(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n cache_dir: Optional[Union[str, \"PathLike\"]] = None,\n enable_metadata: bool = False,\n requirements: Optional[List[str]] = None,\n) -> None:\n \"\"\"Initialize the `BasePipeline` instance.\n\n Args:\n name: The name of the pipeline. If not generated, a random one will be generated by default.\n description: A description of the pipeline. Defaults to `None`.\n cache_dir: A directory where the pipeline will be cached. Defaults to `None`.\n enable_metadata: Whether to include the distilabel metadata column for the pipeline\n in the final `Distiset`. It contains metadata used by distilabel, for example\n the raw outputs of the `LLM` without processing would be here, inside `raw_output_...`\n field. Defaults to `False`.\n requirements: List of requirements that must be installed to run the pipeline.\n Defaults to `None`, but can be helpful to inform in a pipeline to be shared\n that this requirements must be installed.\n \"\"\"\n self.name = name or _PIPELINE_DEFAULT_NAME\n self.description = description\n self._enable_metadata = enable_metadata\n self.dag = DAG()\n\n if cache_dir:\n self._cache_dir = Path(cache_dir)\n elif env_cache_dir := envs.DISTILABEL_CACHE_DIR:\n self._cache_dir = Path(env_cache_dir)\n else:\n self._cache_dir = constants.PIPELINES_CACHE_DIR\n\n self._logger = logging.getLogger(\"distilabel.pipeline\")\n\n self._batch_manager: Optional[\"_BatchManager\"] = None\n self._write_buffer: Optional[\"_WriteBuffer\"] = None\n self._steps_input_queues: Dict[str, \"Queue\"] = {}\n\n self._steps_load_status: Dict[str, int] = {}\n self._steps_load_status_lock = threading.Lock()\n\n self._stop_called = False\n self._stop_called_lock = threading.Lock()\n self._stop_calls = 0\n\n self._recover_offline_batch_generate_for_step: Union[\n Tuple[str, List[List[Dict[str, Any]]]], None\n ] = None\n\n self._fs: Optional[fsspec.AbstractFileSystem] = None\n self._storage_base_path: Optional[str] = None\n self._use_fs_to_pass_data: bool = False\n self._dry_run = False\n\n self._current_stage = 0\n self._stages_last_batch: List[List[str]] = []\n self._load_groups = []\n\n self.requirements = requirements or []\n\n self._exception: Union[Exception, None] = None\n\n self._log_queue: Union[\"Queue[Any]\", None] = None\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__enter__","title":"__enter__() ","text":"Set the global pipeline instance when entering a pipeline context. 
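For orientation, a minimal sketch of how the constructor arguments above and the pipeline context manager are typically combined (the `LoadDataFromDicts` import path is an assumption; substitute whatever steps you actually use):

from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts  # import path assumed

# Entering the `with` block calls `__enter__`, which registers this pipeline as the
# global one so steps instantiated inside the block attach to it automatically;
# leaving the block calls `__exit__`, which unsets it again.
with Pipeline(
    name="toy-pipeline",            # a default name is assigned if omitted
    description="minimal example",
    cache_dir=".distilabel_cache",  # otherwise DISTILABEL_CACHE_DIR or the default cache dir
    enable_metadata=False,
    requirements=["datasets"],      # informative for pipelines shared with others
) as pipeline:
    loader = LoadDataFromDicts(data=[{"instruction": "Say hi"}])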
Source code in src/distilabel/pipeline/base.py def __enter__(self) -> Self:\n \"\"\"Set the global pipeline instance when entering a pipeline context.\"\"\"\n _GlobalPipelineManager.set_pipeline(self)\n return self\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__exit__","title":"__exit__(exc_type, exc_value, traceback) ","text":"Unset the global pipeline instance when exiting a pipeline context. Source code in src/distilabel/pipeline/base.py def __exit__(self, exc_type, exc_value, traceback) -> None:\n \"\"\"Unset the global pipeline instance when exiting a pipeline context.\"\"\"\n _GlobalPipelineManager.set_pipeline(None)\n self._set_pipeline_name()\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.run","title":"run(parameters=None, load_groups=None, use_cache=True, storage_parameters=None, use_fs_to_pass_data=False, dataset=None, dataset_batch_size=50, logging_handlers=None) ","text":"Run the pipeline. It will set the runtime parameters for the steps and validate the pipeline. This method should be extended by the specific pipeline implementation, adding the logic to run the pipeline. Parameters: Name Type Description Default parameters Optional[Dict[str, Dict[str, Any]]] A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None . None load_groups Optional[LoadGroups] A list containing lists of steps that have to be loaded together and in isolation with respect to the rest of the steps of the pipeline. This argument also allows passing the following modes: - \"sequential_step_execution\": each step will be executed in a stage i.e. the execution of the steps will be sequential.
Defaults to None . None use_cache bool Whether to use the cache from previous pipeline runs. Defaults to True . True storage_parameters Optional[Dict[str, Any]] A dictionary with the storage parameters (fsspec and path) that will be used to store the data of the _Batch es passed between the steps if use_fs_to_pass_data is True (for the batches received by a GlobalStep it will be always used). It must have at least the \"path\" key, and it can contain additional keys depending on the protocol. By default, it will use the local file system and a directory in the cache directory. Defaults to None . None use_fs_to_pass_data bool Whether to use the file system to pass the data of the _Batch es between the steps. Even if this parameter is False , the Batch es received by GlobalStep s will always use the file system to pass the data. Defaults to False . False dataset Optional[InputDataset] If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None . None dataset_batch_size int if dataset is given, this will be the size of the batches yield by the GeneratorStep created using the dataset . Defaults to 50 . 50 logging_handlers Optional[List[Handler]] A list of logging handlers that will be used to log the output of the pipeline. This argument can be useful so the logging messages can be extracted and used in a different context. Defaults to None . None Returns: Type Description Distiset The Distiset created by the pipeline. Source code in src/distilabel/pipeline/base.py def run(\n self,\n parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n load_groups: Optional[\"LoadGroups\"] = None,\n use_cache: bool = True,\n storage_parameters: Optional[Dict[str, Any]] = None,\n use_fs_to_pass_data: bool = False,\n dataset: Optional[\"InputDataset\"] = None,\n dataset_batch_size: int = 50,\n logging_handlers: Optional[List[logging.Handler]] = None,\n) -> \"Distiset\": # type: ignore\n \"\"\"Run the pipeline. It will set the runtime parameters for the steps and validate\n the pipeline.\n\n This method should be extended by the specific pipeline implementation,\n adding the logic to run the pipeline.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n load_groups: A list containing lists of steps that have to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n This argument also allows passing the following modes:\n\n - \"sequential_step_execution\": each step will be executed in a stage i.e.\n the execution of the steps will be sequential.\n\n Defaults to `None`.\n use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n `True`.\n storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n that will be used to store the data of the `_Batch`es passed between the\n steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n `GlobalStep` it will be always used). It must have at least the \"path\" key,\n and it can contain additional keys depending on the protocol. By default,\n it will use the local file system and a directory in the cache directory.\n Defaults to `None`.\n use_fs_to_pass_data: Whether to use the file system to pass the data of\n the `_Batch`es between the steps. 
Even if this parameter is `False`, the\n `Batch`es received by `GlobalStep`s will always use the file system to\n pass the data. Defaults to `False`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n dataset_batch_size: if `dataset` is given, this will be the size of the batches\n yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n logging_handlers: A list of logging handlers that will be used to log the\n output of the pipeline. This argument can be useful so the logging messages\n can be extracted and used in a different context. Defaults to `None`.\n\n Returns:\n The `Distiset` created by the pipeline.\n \"\"\"\n\n self._exception: Union[Exception, None] = None\n\n # Set the runtime parameters that will be used during the pipeline execution.\n # They are used to generate the signature of the pipeline that is used to hit the\n # cache when the pipeline is run, so it's important to do it first.\n self._set_runtime_parameters(parameters or {})\n\n self._refresh_pipeline_from_cache()\n\n if dataset is not None:\n self._add_dataset_generator_step(dataset, dataset_batch_size)\n\n setup_logging(\n log_queue=self._log_queue,\n filename=str(self._cache_location[\"log_file\"]),\n logging_handlers=logging_handlers,\n )\n\n # Set the name of the pipeline if it's the default one. This should be called\n # if the pipeline is defined within the context manager, and the run is called\n # outside of it. Is here in the following case:\n # with Pipeline() as pipeline:\n # pipeline.run()\n self._set_pipeline_name()\n\n # Validate the pipeline DAG to check that all the steps are chainable, there are\n # no missing runtime parameters, batch sizes are correct, load groups are valid,\n # etc.\n self._load_groups = self._built_load_groups(load_groups)\n self._validate()\n\n self._set_pipeline_artifacts_path_in_steps()\n\n # Set the initial load status for all the steps\n self._init_steps_load_status()\n\n # Load the stages status or initialize it\n self._load_stages_status(use_cache)\n\n # Load the `_BatchManager` from cache or create one from scratch\n self._load_batch_manager(use_cache)\n\n # Check pipeline requirements are installed\n self._check_requirements()\n\n # Setup the filesystem that will be used to pass the data of the `_Batch`es\n self._setup_fsspec(storage_parameters)\n self._use_fs_to_pass_data = use_fs_to_pass_data\n\n if self._dry_run:\n self._logger.info(\"\ud83c\udf35 Dry run mode\")\n\n # If the batch manager is not able to generate batches, that means that the loaded\n # `_BatchManager` from cache didn't have any remaining batches to process i.e.\n # the previous pipeline execution was completed successfully.\n if not self._batch_manager.can_generate(): # type: ignore\n self._logger.info(\n \"\ud83d\udcbe Loaded batch manager from cache doesn't contain any remaining data.\"\n \" Returning `Distiset` from cache data...\"\n )\n distiset = create_distiset(\n data_dir=self._cache_location[\"data\"],\n pipeline_path=self._cache_location[\"pipeline\"],\n log_filename_path=self._cache_location[\"log_file\"],\n enable_metadata=self._enable_metadata,\n dag=self.dag,\n )\n stop_logging()\n return distiset\n\n self._setup_write_buffer(use_cache)\n\n self._print_load_stages_info()\n 
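A hedged usage sketch of the method above (only the keyword arguments shown are taken from the documented signature; the step name, the runtime-parameter structure and the `push_to_hub` call are illustrative assumptions):

# Sketch of a typical `run` call; "text_generation" is a hypothetical step name and
# the nested runtime-parameter structure is illustrative only.
distiset = pipeline.run(
    parameters={
        "text_generation": {"llm": {"generation_kwargs": {"temperature": 0.7}}},
    },
    use_cache=True,             # reuse results from a previous identical run
    use_fs_to_pass_data=False,  # batches for `GlobalStep`s still go through the filesystem
    dataset_batch_size=50,
)
distiset.push_to_hub("my-org/my-dataset")  # assuming the `Distiset` Hub helper

For quick smoke tests, `dry_run` (documented next) runs the same pipeline with a single generator batch and the cache disabled.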
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.dry_run","title":"dry_run(parameters=None, batch_size=1, dataset=None) ","text":"Do a dry run to test the pipeline runs as expected. Running a Pipeline in dry run mode will set all the batch_size of generator steps to the specified batch_size , and run just with a single batch, effectively running the whole pipeline with a single example. The cache will be set to False . Parameters: Name Type Description Default parameters Optional[Dict[str, Dict[str, Any]]] A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None . None batch_size int The batch size of the unique batch generated by the generators steps of the pipeline. Defaults to 1 . 1 dataset Optional[InputDataset] If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None . None Returns: Type Description Distiset Will return the Distiset as the main run method would do. Source code in src/distilabel/pipeline/base.py def dry_run(\n self,\n parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n batch_size: int = 1,\n dataset: Optional[\"InputDataset\"] = None,\n) -> \"Distiset\":\n \"\"\"Do a dry run to test the pipeline runs as expected.\n\n Running a `Pipeline` in dry run mode will set all the `batch_size` of generator steps\n to the specified `batch_size`, and run just with a single batch, effectively\n running the whole pipeline with a single example. The cache will be set to `False`.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n batch_size: The batch size of the unique batch generated by the generators\n steps of the pipeline. Defaults to `1`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n\n Returns:\n Will return the `Distiset` as the main run method would do.\n \"\"\"\n self._dry_run = True\n\n for step_name in self.dag:\n step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n\n if step.is_generator:\n if not parameters:\n parameters = {}\n parameters[step_name] = {\"batch_size\": batch_size}\n\n distiset = self.run(parameters=parameters, use_cache=False, dataset=dataset)\n\n self._dry_run = False\n return distiset\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.get_load_stages","title":"get_load_stages(load_groups=None) ","text":"Convenient method to get the load stages of a pipeline. Parameters: Name Type Description Default load_groups Optional[LoadGroups] A list containing list of steps that has to be loaded together and in isolation with respect to the rest of the steps of the pipeline. Defaults to None . None Returns: Type Description LoadStages A tuple with the first element containing asorted list by stage containing LoadStages lists with the names of the steps of the stage, and the second element a list LoadStages sorted by stage containing lists with the names of the last steps of the stage. 
Source code in src/distilabel/pipeline/base.py def get_load_stages(self, load_groups: Optional[\"LoadGroups\"] = None) -> LoadStages:\n \"\"\"Convenient method to get the load stages of a pipeline.\n\n Args:\n load_groups: A list containing list of steps that has to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n Defaults to `None`.\n\n Returns:\n A tuple with the first element containing asorted list by stage containing\n lists with the names of the steps of the stage, and the second element a list\n sorted by stage containing lists with the names of the last steps of the stage.\n \"\"\"\n load_groups = self._built_load_groups(load_groups)\n return self.dag.get_steps_load_stages(load_groups)\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.get_runtime_parameters_info","title":"get_runtime_parameters_info() ","text":"Get the runtime parameters for the steps in the pipeline. Returns: Type Description PipelineRuntimeParametersInfo A dictionary with the step name as the key and a list of dictionaries with PipelineRuntimeParametersInfo the parameter name and the parameter info as the value. Source code in src/distilabel/pipeline/base.py def get_runtime_parameters_info(self) -> \"PipelineRuntimeParametersInfo\":\n \"\"\"Get the runtime parameters for the steps in the pipeline.\n\n Returns:\n A dictionary with the step name as the key and a list of dictionaries with\n the parameter name and the parameter info as the value.\n \"\"\"\n runtime_parameters = {}\n for step_name in self.dag:\n step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n runtime_parameters[step_name] = step.get_runtime_parameters_info()\n return runtime_parameters\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.draw","title":"draw(path='pipeline.png', top_to_bottom=False, show_edge_labels=True) ","text":"Draws the pipeline. Parameters: Name Type Description Default path Optional[Union[str, Path]] The path to save the image to. 'pipeline.png' top_to_bottom bool Whether to draw the DAG top to bottom. Defaults to False . False show_edge_labels bool Whether to show the edge labels. Defaults to True . True Returns: Type Description str The path to the saved image. Source code in src/distilabel/pipeline/base.py def draw(\n self,\n path: Optional[Union[str, Path]] = \"pipeline.png\",\n top_to_bottom: bool = False,\n show_edge_labels: bool = True,\n) -> str:\n \"\"\"\n Draws the pipeline.\n\n Parameters:\n path: The path to save the image to.\n top_to_bottom: Whether to draw the DAG top to bottom. Defaults to `False`.\n show_edge_labels: Whether to show the edge labels. Defaults to `True`.\n\n Returns:\n The path to the saved image.\n \"\"\"\n png = self.dag.draw(\n top_to_bottom=top_to_bottom, show_edge_labels=show_edge_labels\n )\n with open(path, \"wb\") as f:\n f.write(png)\n return path\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__repr__","title":"__repr__() ","text":"If running in a Jupyter notebook, display an image representing this Pipeline . 
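The notebook preview relies on the same DAG rendering that `draw` exposes above; a minimal sketch of saving the diagram and inspecting runtime parameters outside a notebook:

# Save the DAG diagram to disk and list the runtime parameters exposed by each step.
image_path = pipeline.draw(path="pipeline.png", top_to_bottom=True)
print(f"Diagram saved to {image_path}")
for step_name, params_info in pipeline.get_runtime_parameters_info().items():
    print(step_name, params_info)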
Source code in src/distilabel/pipeline/base.py def __repr__(self) -> str:\n \"\"\"\n If running in a Jupyter notebook, display an image representing this `Pipeline`.\n \"\"\"\n if in_notebook():\n try:\n from IPython.display import Image, display\n\n image_data = self.dag.draw()\n\n display(Image(image_data))\n except Exception:\n pass\n return super().__repr__()\n "},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.from_dict","title":"from_dict(data) classmethod ","text":"Create a Pipeline from a dict containing the serialized data. Note It's intended for internal use. Parameters: Name Type Description Default data Dict[str, Any] Dictionary containing the serialized data from a Pipeline. required Returns: Name Type Description BasePipeline Self Pipeline recreated from the dictionary info. Source code in src/distilabel/pipeline/base.py @classmethod\ndef from_dict(cls, data: Dict[str, Any]) -> Self:\n \"\"\"Create a Pipeline from a dict containing the serialized data.\n\n Note:\n It's intended for internal use.\n\n Args:\n data (Dict[str, Any]): Dictionary containing the serialized data from a Pipeline.\n\n Returns:\n BasePipeline: Pipeline recreated from the dictionary info.\n \"\"\"\n name = data[\"pipeline\"][\"name\"]\n description = data[\"pipeline\"].get(\"description\")\n requirements = data.get(\"requirements\", [])\n with cls(name=name, description=description, requirements=requirements) as pipe:\n pipe.dag = DAG.from_dict(data[\"pipeline\"])\n return pipe\n "},{"location":"api/pipeline/#distilabel.pipeline.local","title":"local ","text":""},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline","title":"Pipeline ","text":" Bases: BasePipeline Local pipeline implementation using multiprocessing . Source code in src/distilabel/pipeline/local.py class Pipeline(BasePipeline):\n \"\"\"Local pipeline implementation using `multiprocessing`.\"\"\"\n\n def ray(\n self,\n ray_head_node_url: Optional[str] = None,\n ray_init_kwargs: Optional[Dict[str, Any]] = None,\n ) -> RayPipeline:\n \"\"\"Creates a `RayPipeline` using the init parameters of this pipeline. This is a\n convenient method that can be used to \"transform\" one common `Pipeline` to a `RayPipeline`\n and it's mainly used by the CLI.\n\n Args:\n ray_head_node_url: The URL that can be used to connect to the head node of\n the Ray cluster. Normally, you won't want to use this argument as the\n recommended way to submit a job to a Ray cluster is using the [Ray Jobs\n CLI](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html#ray-jobs-overview).\n Defaults to `None`.\n ray_init_kwargs: kwargs that will be passed to the `ray.init` method. 
Defaults\n to `None`.\n\n Returns:\n A `RayPipeline` instance.\n \"\"\"\n pipeline = RayPipeline(\n name=self.name,\n description=self.description,\n cache_dir=self._cache_dir,\n enable_metadata=self._enable_metadata,\n requirements=self.requirements,\n ray_head_node_url=ray_head_node_url,\n ray_init_kwargs=ray_init_kwargs,\n )\n pipeline.dag = self.dag\n return pipeline\n\n def run(\n self,\n parameters: Optional[Dict[Any, Dict[str, Any]]] = None,\n load_groups: Optional[\"LoadGroups\"] = None,\n use_cache: bool = True,\n storage_parameters: Optional[Dict[str, Any]] = None,\n use_fs_to_pass_data: bool = False,\n dataset: Optional[\"InputDataset\"] = None,\n dataset_batch_size: int = 50,\n logging_handlers: Optional[List[\"logging.Handler\"]] = None,\n ) -> \"Distiset\":\n \"\"\"Runs the pipeline.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n load_groups: A list containing lists of steps that have to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n This argument also allows passing the following modes:\n\n - \"sequential_step_execution\": each step will be executed in a stage i.e.\n the execution of the steps will be sequential.\n\n Defaults to `None`.\n use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n `True`.\n storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n that will be used to store the data of the `_Batch`es passed between the\n steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n `GlobalStep` it will be always used). It must have at least the \"path\" key,\n and it can contain additional keys depending on the protocol. By default,\n it will use the local file system and a directory in the cache directory.\n Defaults to `None`.\n use_fs_to_pass_data: Whether to use the file system to pass the data of\n the `_Batch`es between the steps. Even if this parameter is `False`, the\n `Batch`es received by `GlobalStep`s will always use the file system to\n pass the data. Defaults to `False`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n dataset_batch_size: if `dataset` is given, this will be the size of the batches\n yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n logging_handlers: A list of logging handlers that will be used to log the\n output of the pipeline. This argument can be useful so the logging messages\n can be extracted and used in a different context. Defaults to `None`.\n\n Returns:\n The `Distiset` created by the pipeline.\n\n Raises:\n RuntimeError: If the pipeline fails to load all the steps.\n \"\"\"\n if script_executed_in_ray_cluster():\n print(\"Script running in Ray cluster... 
Using `RayPipeline`...\")\n return self.ray().run(\n parameters=parameters,\n use_cache=use_cache,\n storage_parameters=storage_parameters,\n use_fs_to_pass_data=use_fs_to_pass_data,\n dataset=dataset,\n dataset_batch_size=dataset_batch_size,\n )\n\n self._log_queue = cast(\"Queue[Any]\", mp.Queue())\n\n if distiset := super().run(\n parameters=parameters,\n load_groups=load_groups,\n use_cache=use_cache,\n storage_parameters=storage_parameters,\n use_fs_to_pass_data=use_fs_to_pass_data,\n dataset=dataset,\n dataset_batch_size=dataset_batch_size,\n logging_handlers=logging_handlers,\n ):\n return distiset\n\n num_processes = self.dag.get_total_replica_count()\n with (\n mp.Manager() as manager,\n _NoDaemonPool(\n num_processes,\n initializer=_init_worker,\n initargs=(\n self._log_queue,\n self.name,\n self.signature,\n ),\n ) as pool,\n ):\n self._manager = manager\n self._pool = pool\n self._output_queue = self.QueueClass()\n self._load_queue = self.QueueClass()\n self._handle_keyboard_interrupt()\n\n # Run the loop for receiving the load status of each step\n self._load_steps_thread = self._run_load_queue_loop_in_thread()\n\n # Start a loop to receive the output batches from the steps\n self._output_queue_thread = self._run_output_queue_loop_in_thread()\n self._output_queue_thread.join()\n\n self._teardown()\n\n if self._exception:\n raise self._exception\n\n distiset = create_distiset(\n self._cache_location[\"data\"],\n pipeline_path=self._cache_location[\"pipeline\"],\n log_filename_path=self._cache_location[\"log_file\"],\n enable_metadata=self._enable_metadata,\n dag=self.dag,\n )\n\n stop_logging()\n\n return distiset\n\n @property\n def QueueClass(self) -> Callable:\n \"\"\"The callable used to create the input and output queues.\n\n Returns:\n The callable to create a `Queue`.\n \"\"\"\n assert self._manager, \"Manager is not initialized\"\n return self._manager.Queue\n\n def _run_step(self, step: \"_Step\", input_queue: \"Queue[Any]\", replica: int) -> None:\n \"\"\"Runs the `Step` wrapped in a `_ProcessWrapper` in a separate process of the\n `Pool`.\n\n Args:\n step: The step to run.\n input_queue: The input queue to send the data to the step.\n replica: The replica ID assigned.\n \"\"\"\n assert self._pool, \"Pool is not initialized\"\n\n step_wrapper = _StepWrapper(\n step=step, # type: ignore\n replica=replica,\n input_queue=input_queue,\n output_queue=self._output_queue,\n load_queue=self._load_queue,\n dry_run=self._dry_run,\n ray_pipeline=False,\n )\n\n self._pool.apply_async(step_wrapper.run, error_callback=self._error_callback)\n\n def _error_callback(self, e: BaseException) -> None:\n \"\"\"Error callback that will be called when an error occurs in a `Step` process.\n\n Args:\n e: The exception raised by the process.\n \"\"\"\n global _SUBPROCESS_EXCEPTION\n\n # First we check that the exception is a `_StepWrapperException`, otherwise, we\n # print it out and stop the pipeline, since some errors may be unhandled\n if not isinstance(e, _StepWrapperException):\n self._logger.error(f\"\u274c Failed with an unhandled exception: {e}\")\n self._stop()\n return\n\n if e.is_load_error:\n self._logger.error(f\"\u274c Failed to load step '{e.step.name}': {e.message}\")\n _SUBPROCESS_EXCEPTION = e.subprocess_exception\n _SUBPROCESS_EXCEPTION.__traceback__ = tblib.Traceback.from_string( # type: ignore\n e.formatted_traceback\n ).as_traceback()\n return\n\n # If the step is global, is not in the last trophic level and has no successors,\n # then we can ignore the error and continue 
executing the pipeline\n step_name: str = e.step.name # type: ignore\n if (\n e.step.is_global\n and not self.dag.step_in_last_trophic_level(step_name)\n and list(self.dag.get_step_successors(step_name)) == []\n ):\n self._logger.error(\n f\"\u270b An error occurred when running global step '{step_name}' with no\"\n \" successors and not in the last trophic level. Pipeline execution can\"\n f\" continue. Error will be ignored.\"\n )\n self._logger.error(f\"Subprocess traceback:\\n\\n{e.formatted_traceback}\")\n return\n\n # Handle tasks using an `LLM` using offline batch generation\n if isinstance(\n e.subprocess_exception, DistilabelOfflineBatchGenerationNotFinishedException\n ):\n self._logger.info(\n f\"\u23f9\ufe0f '{e.step.name}' task stopped pipeline execution: LLM offline batch\"\n \" generation in progress. Rerun pipeline with cache to check results and\"\n \" continue execution.\"\n )\n self._set_step_for_recovering_offline_batch_generation(e.step, e.data) # type: ignore\n with self._stop_called_lock:\n if not self._stop_called:\n self._stop(acquire_lock=False)\n return\n\n # Global step with successors failed\n self._logger.error(f\"An error occurred in global step '{step_name}'\")\n self._logger.error(f\"Subprocess traceback:\\n\\n{e.formatted_traceback}\")\n\n self._stop()\n\n def _teardown(self) -> None:\n \"\"\"Clean/release/stop resources reserved to run the pipeline.\"\"\"\n if self._write_buffer:\n self._write_buffer.close()\n\n if self._batch_manager:\n self._batch_manager = None\n\n self._stop_load_queue_loop()\n self._load_steps_thread.join()\n\n if self._pool:\n self._pool.terminate()\n self._pool.join()\n\n if self._manager:\n self._manager.shutdown()\n self._manager.join()\n\n def _set_steps_not_loaded_exception(self) -> None:\n \"\"\"Raises a `RuntimeError` notifying that the steps load has failed.\n\n Raises:\n RuntimeError: containing the information and why a step failed to be loaded.\n \"\"\"\n self._exception = RuntimeError(\n \"Failed to load all the steps. Could not run pipeline.\"\n )\n self._exception.__cause__ = _SUBPROCESS_EXCEPTION\n\n def _stop(self, acquire_lock: bool = True) -> None:\n \"\"\"Stops the pipeline execution. It will first send `None` to the input queues\n of all the steps and then wait until the output queue is empty i.e. all the steps\n finished processing the batches that were sent before the stop flag. Then it will\n send `None` to the output queue to notify the pipeline to stop.\n\n Args:\n acquire_lock: Whether to acquire the lock to access the `_stop_called` attribute.\n \"\"\"\n\n if acquire_lock:\n self._stop_called_lock.acquire()\n\n if self._stop_called:\n self._stop_calls += 1\n if self._stop_calls == 1:\n self._logger.warning(\"\ud83d\uded1 Press again to force the pipeline to stop.\")\n elif self._stop_calls > 1:\n self._logger.warning(\"\ud83d\uded1 Forcing pipeline interruption.\")\n\n if self._pool:\n self._pool.terminate()\n self._pool.join()\n self._pool = None\n\n if self._manager:\n self._manager.shutdown()\n self._manager.join()\n self._manager = None\n\n stop_logging()\n\n sys.exit(1)\n\n return\n self._stop_called = True\n\n if acquire_lock:\n self._stop_called_lock.release()\n\n self._logger.debug(\n f\"Steps loaded before calling `stop`: {self._steps_load_status}\"\n )\n self._logger.info(\n \"\ud83d\uded1 Stopping pipeline. 
Waiting for steps to finish processing batches...\"\n )\n\n self._stop_output_queue_loop()\n "},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.QueueClass","title":"QueueClass: Callable property ","text":"The callable used to create the input and output queues. Returns: Type Description Callable The callable to create a Queue . "},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.ray","title":"ray(ray_head_node_url=None, ray_init_kwargs=None) ","text":"Creates a RayPipeline using the init parameters of this pipeline. This is a convenient method that can be used to \"transform\" one common Pipeline to a RayPipeline and it's mainly used by the CLI. Parameters: Name Type Description Default ray_head_node_url Optional[str] The URL that can be used to connect to the head node of the Ray cluster. Normally, you won't want to use this argument as the recommended way to submit a job to a Ray cluster is using the Ray Jobs CLI. Defaults to None . None ray_init_kwargs Optional[Dict[str, Any]] kwargs that will be passed to the ray.init method. Defaults to None . None Returns: Type Description RayPipeline A RayPipeline instance. Source code in src/distilabel/pipeline/local.py def ray(\n self,\n ray_head_node_url: Optional[str] = None,\n ray_init_kwargs: Optional[Dict[str, Any]] = None,\n) -> RayPipeline:\n \"\"\"Creates a `RayPipeline` using the init parameters of this pipeline. This is a\n convenient method that can be used to \"transform\" one common `Pipeline` to a `RayPipeline`\n and it's mainly used by the CLI.\n\n Args:\n ray_head_node_url: The URL that can be used to connect to the head node of\n the Ray cluster. Normally, you won't want to use this argument as the\n recommended way to submit a job to a Ray cluster is using the [Ray Jobs\n CLI](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html#ray-jobs-overview).\n Defaults to `None`.\n ray_init_kwargs: kwargs that will be passed to the `ray.init` method. Defaults\n to `None`.\n\n Returns:\n A `RayPipeline` instance.\n \"\"\"\n pipeline = RayPipeline(\n name=self.name,\n description=self.description,\n cache_dir=self._cache_dir,\n enable_metadata=self._enable_metadata,\n requirements=self.requirements,\n ray_head_node_url=ray_head_node_url,\n ray_init_kwargs=ray_init_kwargs,\n )\n pipeline.dag = self.dag\n return pipeline\n "},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.run","title":"run(parameters=None, load_groups=None, use_cache=True, storage_parameters=None, use_fs_to_pass_data=False, dataset=None, dataset_batch_size=50, logging_handlers=None) ","text":"Runs the pipeline. Parameters: Name Type Description Default parameters Optional[Dict[Any, Dict[str, Any]]] A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None . None load_groups Optional[LoadGroups] A list containing lists of steps that have to be loaded together and in isolation with respect to the rest of the steps of the pipeline. This argument also allows passing the following modes: - \"sequential_step_execution\": each step will be executed in a stage i.e. the execution of the steps will be sequential.
Defaults to None . None use_cache bool Whether to use the cache from previous pipeline runs. Defaults to True . True storage_parameters Optional[Dict[str, Any]] A dictionary with the storage parameters (fsspec and path) that will be used to store the data of the _Batch es passed between the steps if use_fs_to_pass_data is True (for the batches received by a GlobalStep it will be always used). It must have at least the \"path\" key, and it can contain additional keys depending on the protocol. By default, it will use the local file system and a directory in the cache directory. Defaults to None . None use_fs_to_pass_data bool Whether to use the file system to pass the data of the _Batch es between the steps. Even if this parameter is False , the Batch es received by GlobalStep s will always use the file system to pass the data. Defaults to False . False dataset Optional[InputDataset] If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None . None dataset_batch_size int if dataset is given, this will be the size of the batches yield by the GeneratorStep created using the dataset . Defaults to 50 . 50 logging_handlers Optional[List[Handler]] A list of logging handlers that will be used to log the output of the pipeline. This argument can be useful so the logging messages can be extracted and used in a different context. Defaults to None . None Returns: Type Description Distiset The Distiset created by the pipeline. Raises: Type Description RuntimeError If the pipeline fails to load all the steps. Source code in src/distilabel/pipeline/local.py def run(\n self,\n parameters: Optional[Dict[Any, Dict[str, Any]]] = None,\n load_groups: Optional[\"LoadGroups\"] = None,\n use_cache: bool = True,\n storage_parameters: Optional[Dict[str, Any]] = None,\n use_fs_to_pass_data: bool = False,\n dataset: Optional[\"InputDataset\"] = None,\n dataset_batch_size: int = 50,\n logging_handlers: Optional[List[\"logging.Handler\"]] = None,\n) -> \"Distiset\":\n \"\"\"Runs the pipeline.\n\n Args:\n parameters: A dictionary with the step name as the key and a dictionary with\n the runtime parameters for the step as the value. Defaults to `None`.\n load_groups: A list containing lists of steps that have to be loaded together\n and in isolation with respect to the rest of the steps of the pipeline.\n This argument also allows passing the following modes:\n\n - \"sequential_step_execution\": each step will be executed in a stage i.e.\n the execution of the steps will be sequential.\n\n Defaults to `None`.\n use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n `True`.\n storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n that will be used to store the data of the `_Batch`es passed between the\n steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n `GlobalStep` it will be always used). It must have at least the \"path\" key,\n and it can contain additional keys depending on the protocol. By default,\n it will use the local file system and a directory in the cache directory.\n Defaults to `None`.\n use_fs_to_pass_data: Whether to use the file system to pass the data of\n the `_Batch`es between the steps. Even if this parameter is `False`, the\n `Batch`es received by `GlobalStep`s will always use the file system to\n pass the data. 
Defaults to `False`.\n dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n root step. Convenient method when you have already processed the dataset in\n your script and just want to pass it already processed. Defaults to `None`.\n dataset_batch_size: if `dataset` is given, this will be the size of the batches\n yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n logging_handlers: A list of logging handlers that will be used to log the\n output of the pipeline. This argument can be useful so the logging messages\n can be extracted and used in a different context. Defaults to `None`.\n\n Returns:\n The `Distiset` created by the pipeline.\n\n Raises:\n RuntimeError: If the pipeline fails to load all the steps.\n \"\"\"\n if script_executed_in_ray_cluster():\n print(\"Script running in Ray cluster... Using `RayPipeline`...\")\n return self.ray().run(\n parameters=parameters,\n use_cache=use_cache,\n storage_parameters=storage_parameters,\n use_fs_to_pass_data=use_fs_to_pass_data,\n dataset=dataset,\n dataset_batch_size=dataset_batch_size,\n )\n\n self._log_queue = cast(\"Queue[Any]\", mp.Queue())\n\n if distiset := super().run(\n parameters=parameters,\n load_groups=load_groups,\n use_cache=use_cache,\n storage_parameters=storage_parameters,\n use_fs_to_pass_data=use_fs_to_pass_data,\n dataset=dataset,\n dataset_batch_size=dataset_batch_size,\n logging_handlers=logging_handlers,\n ):\n return distiset\n\n num_processes = self.dag.get_total_replica_count()\n with (\n mp.Manager() as manager,\n _NoDaemonPool(\n num_processes,\n initializer=_init_worker,\n initargs=(\n self._log_queue,\n self.name,\n self.signature,\n ),\n ) as pool,\n ):\n self._manager = manager\n self._pool = pool\n self._output_queue = self.QueueClass()\n self._load_queue = self.QueueClass()\n self._handle_keyboard_interrupt()\n\n # Run the loop for receiving the load status of each step\n self._load_steps_thread = self._run_load_queue_loop_in_thread()\n\n # Start a loop to receive the output batches from the steps\n self._output_queue_thread = self._run_output_queue_loop_in_thread()\n self._output_queue_thread.join()\n\n self._teardown()\n\n if self._exception:\n raise self._exception\n\n distiset = create_distiset(\n self._cache_location[\"data\"],\n pipeline_path=self._cache_location[\"pipeline\"],\n log_filename_path=self._cache_location[\"log_file\"],\n enable_metadata=self._enable_metadata,\n dag=self.dag,\n )\n\n stop_logging()\n\n return distiset\n "},{"location":"api/pipeline/routing_batch_function/","title":"Routing batch function","text":""},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function","title":"routing_batch_function ","text":""},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunc","title":"RoutingBatchFunc = Callable[[List[str]], List[str]] module-attribute ","text":"Type alias for a routing batch function. It takes a list of all the downstream steps and returns a list with the names of the steps that should receive the batch. "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction","title":"RoutingBatchFunction ","text":" Bases: BaseModel , _Serializable A thin wrapper around a routing batch function that can be used to route batches from one upstream step to specific downstream steps. 
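As a hedged sketch of how such a wrapper is usually created, a routing function can be defined with the `routing_batch_function` decorator and connected with the `>>` operator (the decorator import path and the step names in the comments are assumptions):

import random
from typing import List

from distilabel.pipeline import routing_batch_function  # decorator import path assumed

# The decorator wraps the plain function into a `RoutingBatchFunction`, so it can be
# placed between an upstream step and its successors, e.g.
# `upstream_step >> sample_two_steps >> [step_a, step_b, step_c]`.
@routing_batch_function()
def sample_two_steps(steps: List[str]) -> List[str]:
    # Receives the names of all downstream steps and returns the subset that
    # should receive the current batch.
    return random.sample(steps, 2)

Factory-created routing functions additionally record their module, name and kwargs through `set_factory_function`, which is what allows `from_dict`, documented further down, to reconstruct them.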
Attributes: Name Type Description routing_function RoutingBatchFunc The routing function that takes a list of all the downstream steps and returns a list with the names of the steps that should receive the batch. _step Union[_Step, None] The upstream step that is connected to the routing batch function. _routed_batch_registry Dict[str, Dict[int, List[str]]] A dictionary that keeps track of the batches that have been routed to specific downstream steps. Source code in src/distilabel/pipeline/routing_batch_function.py class RoutingBatchFunction(BaseModel, _Serializable):\n \"\"\"A thin wrapper around a routing batch function that can be used to route batches\n from one upstream step to specific downstream steps.\n\n Attributes:\n routing_function: The routing function that takes a list of all the downstream steps\n and returns a list with the names of the steps that should receive the batch.\n _step: The upstream step that is connected to the routing batch function.\n _routed_batch_registry: A dictionary that keeps track of the batches that have been\n routed to specific downstream steps.\n \"\"\"\n\n routing_function: RoutingBatchFunc\n description: Optional[str] = None\n\n _step: Union[\"_Step\", None] = PrivateAttr(default=None)\n _routed_batch_registry: Dict[str, Dict[int, List[str]]] = PrivateAttr(\n default_factory=dict\n )\n _factory_function_module: Union[str, None] = PrivateAttr(default=None)\n _factory_function_name: Union[str, None] = PrivateAttr(default=None)\n _factory_function_kwargs: Union[Dict[str, Any], None] = PrivateAttr(default=None)\n\n def route_batch(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n should be routed.\n\n Args:\n batch: The batch that should be routed.\n steps: A list of all the downstream steps that can receive the batch.\n\n Returns:\n A list with the names of the steps that should receive the batch.\n \"\"\"\n routed_steps = self.routing_function(steps)\n self._register_routed_batch(batch, routed_steps)\n return routed_steps\n\n def set_factory_function(\n self,\n factory_function_module: str,\n factory_function_name: str,\n factory_function_kwargs: Dict[str, Any],\n ) -> None:\n \"\"\"Sets the factory function that was used to create the `routing_batch_function`.\n\n Args:\n factory_function_module: The module name where the factory function is defined.\n factory_function_name: The name of the factory function that was used to create\n the `routing_batch_function`.\n factory_function_kwargs: The keyword arguments that were used when calling the\n factory function.\n \"\"\"\n self._factory_function_module = factory_function_module\n self._factory_function_name = factory_function_name\n self._factory_function_kwargs = factory_function_kwargs\n\n def __call__(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n should be routed.\n\n Args:\n batch: The batch that should be routed.\n steps: A list of all the downstream steps that can receive the batch.\n\n Returns:\n A list with the names of the steps that should receive the batch.\n \"\"\"\n return self.route_batch(batch, steps)\n\n def _register_routed_batch(self, batch: \"_Batch\", routed_steps: List[str]) -> None:\n \"\"\"Registers a batch that has been routed to specific downstream steps.\n\n Args:\n batch: The batch that has been routed.\n routed_steps: The list of downstream steps that have been selected to 
receive\n the batch.\n \"\"\"\n upstream_step = batch.step_name\n batch_seq_no = batch.seq_no\n self._routed_batch_registry.setdefault(upstream_step, {}).setdefault(\n batch_seq_no, routed_steps\n )\n\n def __rshift__(\n self, other: List[\"DownstreamConnectableSteps\"]\n ) -> List[\"DownstreamConnectableSteps\"]:\n \"\"\"Connects a list of dowstream steps to the upstream step of the routing batch\n function.\n\n Args:\n other: A list of downstream steps that should be connected to the upstream step\n of the routing batch function.\n\n Returns:\n The list of downstream steps that have been connected to the upstream step of the\n routing batch function.\n \"\"\"\n if not isinstance(other, list):\n raise DistilabelUserError(\n f\"Can only set a `routing_batch_function` for a list of steps. Got: {other}.\"\n \" Please, review the right-hand side of the `routing_batch_function >> other`\"\n \" expression. It should be\"\n \" `upstream_step >> routing_batch_function >> [downstream_step_1, dowstream_step_2, ...]`.\",\n page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n )\n\n if not self._step:\n raise DistilabelUserError(\n \"Routing batch function doesn't have an upstream step. Cannot connect downstream\"\n \" steps before connecting the upstream step. Connect this routing batch\"\n \" function to an upstream step using the `>>` operator. For example:\"\n \" `upstream_step >> routing_batch_function >> [downstream_step_1, downstream_step_2, ...]`.\",\n page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n )\n\n for step in other:\n self._step.connect(step)\n return other\n\n def dump(self, **kwargs: Any) -> Dict[str, Any]:\n \"\"\"Dumps the routing batch function to a dictionary, and the information of the\n factory function used to create this routing batch function.\n\n Args:\n **kwargs: Additional keyword arguments that should be included in the dump.\n\n Returns:\n A dictionary with the routing batch function information and the factory function\n information.\n \"\"\"\n dump_info: Dict[str, Any] = {\"step\": self._step.name} # type: ignore\n\n if self.description:\n dump_info[\"description\"] = self.description\n\n if type_info := self._get_type_info():\n dump_info[TYPE_INFO_KEY] = type_info\n\n return dump_info\n\n def _get_type_info(self) -> Dict[str, Any]:\n \"\"\"Returns the information of the factory function used to create the routing batch\n function.\n\n Returns:\n A dictionary with the factory function information.\n \"\"\"\n\n type_info = {}\n\n if self._factory_function_module:\n type_info[\"module\"] = self._factory_function_module\n\n if self._factory_function_name:\n type_info[\"name\"] = self._factory_function_name\n\n if self._factory_function_kwargs:\n type_info[\"kwargs\"] = self._factory_function_kwargs\n\n return type_info\n\n @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> Self:\n \"\"\"Loads a routing batch function from a dictionary. 
It must contain the information\n of the factory function used to create the routing batch function.\n\n Args:\n data: A dictionary with the routing batch function information and the factory\n function information.\n \"\"\"\n type_info = data.get(TYPE_INFO_KEY)\n if not type_info:\n step = data.get(\"step\")\n raise ValueError(\n f\"The routing batch function for step '{step}' was created without a factory\"\n \" function, and it cannot be reconstructed.\"\n )\n\n module = type_info.get(\"module\")\n name = type_info.get(\"name\")\n kwargs = type_info.get(\"kwargs\")\n\n if not module or not name or not kwargs:\n raise ValueError(\n \"The routing batch function was created with a factory function, but the\"\n \" information is incomplete. Cannot reconstruct the routing batch function.\"\n )\n\n routing_batch_function = _get_module_attr(module=module, name=name)(**kwargs)\n routing_batch_function.description = data.get(\"description\")\n routing_batch_function.set_factory_function(\n factory_function_module=module,\n factory_function_name=name,\n factory_function_kwargs=kwargs,\n )\n\n return routing_batch_function\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.route_batch","title":"route_batch(batch, steps) ","text":"Returns a list of selected downstream steps from steps to which the batch should be routed. Parameters: Name Type Description Default batch _Batch The batch that should be routed. required steps List[str] A list of all the downstream steps that can receive the batch. required Returns: Type Description List[str] A list with the names of the steps that should receive the batch. Source code in src/distilabel/pipeline/routing_batch_function.py def route_batch(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n should be routed.\n\n Args:\n batch: The batch that should be routed.\n steps: A list of all the downstream steps that can receive the batch.\n\n Returns:\n A list with the names of the steps that should receive the batch.\n \"\"\"\n routed_steps = self.routing_function(steps)\n self._register_routed_batch(batch, routed_steps)\n return routed_steps\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.set_factory_function","title":"set_factory_function(factory_function_module, factory_function_name, factory_function_kwargs) ","text":"Sets the factory function that was used to create the routing_batch_function . Parameters: Name Type Description Default factory_function_module str The module name where the factory function is defined. required factory_function_name str The name of the factory function that was used to create the routing_batch_function . required factory_function_kwargs Dict[str, Any] The keyword arguments that were used when calling the factory function. 
required Source code in src/distilabel/pipeline/routing_batch_function.py def set_factory_function(\n self,\n factory_function_module: str,\n factory_function_name: str,\n factory_function_kwargs: Dict[str, Any],\n) -> None:\n \"\"\"Sets the factory function that was used to create the `routing_batch_function`.\n\n Args:\n factory_function_module: The module name where the factory function is defined.\n factory_function_name: The name of the factory function that was used to create\n the `routing_batch_function`.\n factory_function_kwargs: The keyword arguments that were used when calling the\n factory function.\n \"\"\"\n self._factory_function_module = factory_function_module\n self._factory_function_name = factory_function_name\n self._factory_function_kwargs = factory_function_kwargs\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.__call__","title":"__call__(batch, steps) ","text":"Returns a list of selected downstream steps from steps to which the batch should be routed. Parameters: Name Type Description Default batch _Batch The batch that should be routed. required steps List[str] A list of all the downstream steps that can receive the batch. required Returns: Type Description List[str] A list with the names of the steps that should receive the batch. Source code in src/distilabel/pipeline/routing_batch_function.py def __call__(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n should be routed.\n\n Args:\n batch: The batch that should be routed.\n steps: A list of all the downstream steps that can receive the batch.\n\n Returns:\n A list with the names of the steps that should receive the batch.\n \"\"\"\n return self.route_batch(batch, steps)\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.__rshift__","title":"__rshift__(other) ","text":"Connects a list of dowstream steps to the upstream step of the routing batch function. Parameters: Name Type Description Default other List[DownstreamConnectableSteps] A list of downstream steps that should be connected to the upstream step of the routing batch function. required Returns: Type Description List[DownstreamConnectableSteps] The list of downstream steps that have been connected to the upstream step of the List[DownstreamConnectableSteps] routing batch function. Source code in src/distilabel/pipeline/routing_batch_function.py def __rshift__(\n self, other: List[\"DownstreamConnectableSteps\"]\n) -> List[\"DownstreamConnectableSteps\"]:\n \"\"\"Connects a list of dowstream steps to the upstream step of the routing batch\n function.\n\n Args:\n other: A list of downstream steps that should be connected to the upstream step\n of the routing batch function.\n\n Returns:\n The list of downstream steps that have been connected to the upstream step of the\n routing batch function.\n \"\"\"\n if not isinstance(other, list):\n raise DistilabelUserError(\n f\"Can only set a `routing_batch_function` for a list of steps. Got: {other}.\"\n \" Please, review the right-hand side of the `routing_batch_function >> other`\"\n \" expression. 
It should be\"\n \" `upstream_step >> routing_batch_function >> [downstream_step_1, dowstream_step_2, ...]`.\",\n page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n )\n\n if not self._step:\n raise DistilabelUserError(\n \"Routing batch function doesn't have an upstream step. Cannot connect downstream\"\n \" steps before connecting the upstream step. Connect this routing batch\"\n \" function to an upstream step using the `>>` operator. For example:\"\n \" `upstream_step >> routing_batch_function >> [downstream_step_1, downstream_step_2, ...]`.\",\n page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n )\n\n for step in other:\n self._step.connect(step)\n return other\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.dump","title":"dump(**kwargs) ","text":"Dumps the routing batch function to a dictionary, and the information of the factory function used to create this routing batch function. Parameters: Name Type Description Default **kwargs Any Additional keyword arguments that should be included in the dump. {} Returns: Type Description Dict[str, Any] A dictionary with the routing batch function information and the factory function Dict[str, Any] information. Source code in src/distilabel/pipeline/routing_batch_function.py def dump(self, **kwargs: Any) -> Dict[str, Any]:\n \"\"\"Dumps the routing batch function to a dictionary, and the information of the\n factory function used to create this routing batch function.\n\n Args:\n **kwargs: Additional keyword arguments that should be included in the dump.\n\n Returns:\n A dictionary with the routing batch function information and the factory function\n information.\n \"\"\"\n dump_info: Dict[str, Any] = {\"step\": self._step.name} # type: ignore\n\n if self.description:\n dump_info[\"description\"] = self.description\n\n if type_info := self._get_type_info():\n dump_info[TYPE_INFO_KEY] = type_info\n\n return dump_info\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.from_dict","title":"from_dict(data) classmethod ","text":"Loads a routing batch function from a dictionary. It must contain the information of the factory function used to create the routing batch function. Parameters: Name Type Description Default data Dict[str, Any] A dictionary with the routing batch function information and the factory function information. required Source code in src/distilabel/pipeline/routing_batch_function.py @classmethod\ndef from_dict(cls, data: Dict[str, Any]) -> Self:\n \"\"\"Loads a routing batch function from a dictionary. It must contain the information\n of the factory function used to create the routing batch function.\n\n Args:\n data: A dictionary with the routing batch function information and the factory\n function information.\n \"\"\"\n type_info = data.get(TYPE_INFO_KEY)\n if not type_info:\n step = data.get(\"step\")\n raise ValueError(\n f\"The routing batch function for step '{step}' was created without a factory\"\n \" function, and it cannot be reconstructed.\"\n )\n\n module = type_info.get(\"module\")\n name = type_info.get(\"name\")\n kwargs = type_info.get(\"kwargs\")\n\n if not module or not name or not kwargs:\n raise ValueError(\n \"The routing batch function was created with a factory function, but the\"\n \" information is incomplete. 
Cannot reconstruct the routing batch function.\"\n )\n\n routing_batch_function = _get_module_attr(module=module, name=name)(**kwargs)\n routing_batch_function.description = data.get(\"description\")\n routing_batch_function.set_factory_function(\n factory_function_module=module,\n factory_function_name=name,\n factory_function_kwargs=kwargs,\n )\n\n return routing_batch_function\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.routing_batch_function","title":"routing_batch_function(description=None) ","text":"Creates a routing batch function that can be used to route batches from one upstream step to specific downstream steps. Parameters: Name Type Description Default description Optional[str] An optional description for the routing batch function. None Returns: Type Description Callable[[RoutingBatchFunc], RoutingBatchFunction] A RoutingBatchFunction instance that can be used with the >> operators and with Callable[[RoutingBatchFunc], RoutingBatchFunction] the Pipeline.connect method when defining the pipeline. Example: from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import LoadDataFromHub, GroupColumns\n\n\n@routing_batch_function\ndef random_routing_batch(steps: List[str]) -> List[str]:\n return random.sample(steps, 2)\n\n\nwith Pipeline(name=\"routing-batch-function\") as pipeline:\n load_data = LoadDataFromHub()\n\n generations = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n generations.append(task)\n\n combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n load_data >> random_routing_batch >> generations >> combine_columns\n Source code in src/distilabel/pipeline/routing_batch_function.py def routing_batch_function(\n description: Optional[str] = None,\n) -> Callable[[RoutingBatchFunc], RoutingBatchFunction]:\n \"\"\"Creates a routing batch function that can be used to route batches from one upstream\n step to specific downstream steps.\n\n Args:\n description: An optional description for the routing batch function.\n\n Returns:\n A `RoutingBatchFunction` instance that can be used with the `>>` operators and with\n the `Pipeline.connect` method when defining the pipeline.\n\n Example:\n\n ```python\n from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\n from distilabel.pipeline import Pipeline, routing_batch_function\n from distilabel.steps import LoadDataFromHub, GroupColumns\n\n\n @routing_batch_function\n def random_routing_batch(steps: List[str]) -> List[str]:\n return random.sample(steps, 2)\n\n\n with Pipeline(name=\"routing-batch-function\") as pipeline:\n load_data = LoadDataFromHub()\n\n generations = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n generations.append(task)\n\n combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n load_data >> random_routing_batch >> generations >> combine_columns\n ```\n \"\"\"\n\n def decorator(func: RoutingBatchFunc) -> RoutingBatchFunction:\n factory_function_name, factory_function_module, factory_function_kwargs = (\n None,\n None,\n None,\n )\n\n # Check if 
`routing_batch_function` was created using a factory function from an installed package\n stack = inspect.stack()\n if len(stack) > 2:\n factory_function_frame_info = stack[1]\n\n # Function factory path\n if factory_function_frame_info.function != \"<module>\":\n factory_function_name = factory_function_frame_info.function\n factory_function_module = inspect.getmodule(\n factory_function_frame_info.frame\n ).__name__ # type: ignore\n\n # Function factory kwargs\n factory_function_kwargs = factory_function_frame_info.frame.f_locals\n\n routing_batch_function = RoutingBatchFunction(\n routing_function=func,\n description=description,\n )\n\n if (\n factory_function_module\n and factory_function_name\n and factory_function_kwargs\n ):\n routing_batch_function.set_factory_function(\n factory_function_module=factory_function_module,\n factory_function_name=factory_function_name,\n factory_function_kwargs=factory_function_kwargs,\n )\n\n return routing_batch_function\n\n return decorator\n "},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.sample_n_steps","title":"sample_n_steps(n) ","text":"A simple function that creates a routing batch function that samples n steps from the list of all the downstream steps. Parameters: Name Type Description Default n int The number of steps to sample from the list of all the downstream steps. required Returns: Type Description RoutingBatchFunction A RoutingBatchFunction instance that can be used with the >> operators and with RoutingBatchFunction the Pipeline.connect method when defining the pipeline. Example: from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, sample_n_steps\nfrom distilabel.steps import LoadDataFromHub, GroupColumns\n\nrandom_routing_batch = sample_n_steps(2)\n\n\nwith Pipeline(name=\"routing-batch-function\") as pipeline:\n load_data = LoadDataFromHub()\n\n generations = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n generations.append(task)\n\n combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n load_data >> random_routing_batch >> generations >> combine_columns\n Source code in src/distilabel/pipeline/routing_batch_function.py def sample_n_steps(n: int) -> RoutingBatchFunction:\n \"\"\"A simple function that creates a routing batch function that samples `n` steps from\n the list of all the downstream steps.\n\n Args:\n n: The number of steps to sample from the list of all the downstream steps.\n\n Returns:\n A `RoutingBatchFunction` instance that can be used with the `>>` operators and with\n the `Pipeline.connect` method when defining the pipeline.\n\n Example:\n\n ```python\n from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\n from distilabel.pipeline import Pipeline, sample_n_steps\n from distilabel.steps import LoadDataFromHub, GroupColumns\n\n random_routing_batch = sample_n_steps(2)\n\n\n with Pipeline(name=\"routing-batch-function\") as pipeline:\n load_data = LoadDataFromHub()\n\n generations = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n generations.append(task)\n\n combine_columns = 
GroupColumns(columns=[\"generation\", \"model_name\"])\n\n load_data >> random_routing_batch >> generations >> combine_columns\n ```\n \"\"\"\n\n @routing_batch_function(\n description=f\"Sample {n} steps from the list of downstream steps.\"\n )\n def sample_n(steps: List[str]) -> List[str]:\n return random.sample(steps, n)\n\n return sample_n\n "},{"location":"api/pipeline/step_wrapper/","title":"Step Wrapper","text":""},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper","title":"_StepWrapper ","text":"Wrapper to run the Step . Attributes: Name Type Description step The step to run. replica The replica ID assigned. input_queue The queue to receive the input data. output_queue The queue to send the output data. load_queue The queue used to notify the main process that the step has been loaded, has been unloaded or has failed to load. Source code in src/distilabel/pipeline/step_wrapper.py class _StepWrapper:\n \"\"\"Wrapper to run the `Step`.\n\n Attributes:\n step: The step to run.\n replica: The replica ID assigned.\n input_queue: The queue to receive the input data.\n output_queue: The queue to send the output data.\n load_queue: The queue used to notify the main process that the step has been loaded,\n has been unloaded or has failed to load.\n \"\"\"\n\n def __init__(\n self,\n step: Union[\"Step\", \"GeneratorStep\"],\n replica: int,\n input_queue: \"Queue[_Batch]\",\n output_queue: \"Queue[_Batch]\",\n load_queue: \"Queue[Union[StepLoadStatus, None]]\",\n dry_run: bool = False,\n ray_pipeline: bool = False,\n ) -> None:\n \"\"\"Initializes the `_ProcessWrapper`.\n\n Args:\n step: The step to run.\n input_queue: The queue to receive the input data.\n output_queue: The queue to send the output data.\n load_queue: The queue used to notify the main process that the step has been\n loaded, has been unloaded or has failed to load.\n dry_run: Flag to ensure we are forcing to run the last batch.\n ray_pipeline: Whether the step is running a `RayPipeline` or not.\n \"\"\"\n self.step = step\n self.replica = replica\n self.input_queue = input_queue\n self.output_queue = output_queue\n self.load_queue = load_queue\n self.dry_run = dry_run\n self.ray_pipeline = ray_pipeline\n\n self._init_cuda_device_placement()\n\n def _init_cuda_device_placement(self) -> None:\n \"\"\"Sets the LLM identifier and the number of desired GPUs of the `CudaDevicePlacementMixin`\"\"\"\n\n def _init_cuda_device_placement_mixin(attr: CudaDevicePlacementMixin) -> None:\n if self.ray_pipeline:\n attr.disable_cuda_device_placement = True\n else:\n desired_num_gpus = self.step.resources.gpus or 1\n attr._llm_identifier = f\"{self.step.name}-replica-{self.replica}\"\n attr._desired_num_gpus = desired_num_gpus\n\n for field_name in self.step.model_fields_set:\n attr = getattr(self.step, field_name)\n if isinstance(attr, CudaDevicePlacementMixin):\n _init_cuda_device_placement_mixin(attr)\n\n if isinstance(self.step, CudaDevicePlacementMixin):\n _init_cuda_device_placement_mixin(self.step)\n\n def run(self) -> str:\n \"\"\"The target function executed by the process. 
This function will also handle\n the step lifecycle, executing first the `load` function of the `Step` and then\n waiting to receive a batch from the `input_queue` that will be handled by the\n `process` method of the `Step`.\n\n Returns:\n The name of the step that was executed.\n \"\"\"\n\n try:\n self.step.load()\n self.step._logger.debug(f\"Step '{self.step.name}' loaded!\")\n except Exception as e:\n self.step.unload()\n self._notify_load_failed()\n raise _StepWrapperException.create_load_error(\n message=f\"Step load failed: {e}\",\n step=self.step,\n subprocess_exception=e,\n ) from e\n\n self._notify_load()\n\n if self.step.is_generator:\n self._generator_step_process_loop()\n else:\n self._non_generator_process_loop()\n\n # Just in case `None` sentinel was sent\n try:\n self.input_queue.get(block=False)\n except Exception:\n pass\n\n self.step.unload()\n\n self._notify_unload()\n\n self.step._logger.info(\n f\"\ud83c\udfc1 Finished running step '{self.step.name}' (replica ID: {self.replica})\"\n )\n\n return self.step.name # type: ignore\n\n def _notify_load(self) -> None:\n \"\"\"Notifies that the step has finished executing its `load` function successfully.\"\"\"\n self.step._logger.debug(\n f\"Notifying load of step '{self.step.name}' (replica ID {self.replica})...\"\n )\n self.load_queue.put({\"name\": self.step.name, \"status\": \"loaded\"}) # type: ignore\n\n def _notify_unload(self) -> None:\n \"\"\"Notifies that the step has been unloaded.\"\"\"\n self.step._logger.debug(\n f\"Notifying unload of step '{self.step.name}' (replica ID {self.replica})...\"\n )\n self.load_queue.put({\"name\": self.step.name, \"status\": \"unloaded\"}) # type: ignore\n\n def _notify_load_failed(self) -> None:\n \"\"\"Notifies that the step failed to load.\"\"\"\n self.step._logger.debug(\n f\"Notifying load failed of step '{self.step.name}' (replica ID {self.replica})...\"\n )\n self.load_queue.put({\"name\": self.step.name, \"status\": \"load_failed\"}) # type: ignore\n\n def _generator_step_process_loop(self) -> None:\n \"\"\"Runs the process loop for a generator step. It will call the `process` method\n of the step and send the output data to the `output_queue` and block until the next\n batch request is received (i.e. 
receiving an empty batch from the `input_queue`).\n\n If the `last_batch` attribute of the batch is `True`, the loop will stop and the\n process will finish.\n\n Raises:\n _StepWrapperException: If an error occurs during the execution of the\n `process` method.\n \"\"\"\n step = cast(\"GeneratorStep\", self.step)\n\n try:\n if (batch := self.input_queue.get()) is None:\n self.step._logger.info(\n f\"\ud83d\uded1 Stopping yielding batches from step '{self.step.name}'\"\n )\n return\n\n offset = batch.seq_no * step.batch_size # type: ignore\n\n self.step._logger.info(\n f\"\ud83d\udeb0 Starting yielding batches from generator step '{self.step.name}'.\"\n f\" Offset: {offset}\"\n )\n\n for data, last_batch in step.process_applying_mappings(offset=offset):\n batch.set_data([data])\n batch.last_batch = self.dry_run or last_batch\n self._send_batch(batch)\n\n if batch.last_batch:\n return\n\n self.step._logger.debug(\n f\"Step '{self.step.name}' waiting for next batch request...\"\n )\n if (batch := self.input_queue.get()) is None:\n self.step._logger.info(\n f\"\ud83d\uded1 Stopping yielding batches from step '{self.step.name}'\"\n )\n return\n except Exception as e:\n raise _StepWrapperException(str(e), self.step, 2, e) from e\n\n def _non_generator_process_loop(self) -> None:\n \"\"\"Runs the process loop for a non-generator step. It will call the `process`\n method of the step and send the output data to the `output_queue` and block until\n the next batch is received from the `input_queue`. If the `last_batch` attribute\n of the batch is `True`, the loop will stop and the process will finish.\n\n If an error occurs during the execution of the `process` method and the step is\n global, the process will raise a `_StepWrapperException`. If the step is not\n global, the process will log the error and send an empty batch to the `output_queue`.\n\n Raises:\n _StepWrapperException: If an error occurs during the execution of the\n `process` method and the step is global.\n \"\"\"\n step = cast(\"Step\", self.step)\n while True:\n if (batch := self.input_queue.get()) is None:\n self.step._logger.info(\n f\"\ud83d\uded1 Stopping processing batches from step '{self.step.name}'\"\n )\n break\n\n if batch == LAST_BATCH_SENT_FLAG:\n self.step._logger.debug(\"Received `LAST_BATCH_SENT_FLAG`. 
Stopping...\")\n break\n\n self.step._logger.info(\n f\"\ud83d\udce6 Processing batch {batch.seq_no} in '{batch.step_name}' (replica ID: {self.replica})\"\n )\n\n if batch.data_path is not None:\n self.step._logger.debug(f\"Reading batch data from '{batch.data_path}'\")\n batch.read_batch_data_from_fs()\n\n result = []\n try:\n if self.step.has_multiple_inputs:\n result = next(step.process_applying_mappings(*batch.data))\n else:\n result = next(step.process_applying_mappings(batch.data[0]))\n except Exception as e:\n if self.step.is_global:\n self.step.unload()\n self._notify_unload()\n data = (\n batch.data\n if isinstance(\n e, DistilabelOfflineBatchGenerationNotFinishedException\n )\n else None\n )\n raise _StepWrapperException(str(e), self.step, 2, e, data) from e\n\n # Impute step outputs columns with `None`\n result = self._impute_step_outputs(batch)\n\n # if the step is not global then we can skip the batch which means sending\n # an empty batch to the output queue\n self.step._logger.warning(\n f\"\u26a0\ufe0f Processing batch {batch.seq_no} with step '{self.step.name}' failed.\"\n \" Sending empty batch filled with `None`s...\"\n )\n self.step._logger.warning(\n f\"Subprocess traceback:\\n\\n{traceback.format_exc()}\"\n )\n finally:\n batch.set_data([result])\n self._send_batch(batch)\n\n if batch.last_batch:\n break\n\n def _impute_step_outputs(self, batch: \"_Batch\") -> List[Dict[str, Any]]:\n \"\"\"Imputes the step outputs columns with `None` in the batch data.\n\n Args:\n batch: The batch to impute.\n \"\"\"\n return self.step.impute_step_outputs(batch.data[0])\n\n def _send_batch(self, batch: _Batch) -> None:\n \"\"\"Sends a batch to the `output_queue`.\"\"\"\n if batch.data_path is not None:\n self.step._logger.debug(f\"Writing batch data to '{batch.data_path}'\")\n batch.write_batch_data_to_fs()\n\n self.step._logger.info(\n f\"\ud83d\udce8 Step '{batch.step_name}' sending batch {batch.seq_no} to output queue\"\n )\n self.output_queue.put(batch)\n "},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper.__init__","title":"__init__(step, replica, input_queue, output_queue, load_queue, dry_run=False, ray_pipeline=False) ","text":"Initializes the _ProcessWrapper . Parameters: Name Type Description Default step Union[Step, GeneratorStep] The step to run. required input_queue Queue[_Batch] The queue to receive the input data. required output_queue Queue[_Batch] The queue to send the output data. required load_queue Queue[Union[StepLoadStatus, None]] The queue used to notify the main process that the step has been loaded, has been unloaded or has failed to load. required dry_run bool Flag to ensure we are forcing to run the last batch. False ray_pipeline bool Whether the step is running a RayPipeline or not. 
False Source code in src/distilabel/pipeline/step_wrapper.py def __init__(\n self,\n step: Union[\"Step\", \"GeneratorStep\"],\n replica: int,\n input_queue: \"Queue[_Batch]\",\n output_queue: \"Queue[_Batch]\",\n load_queue: \"Queue[Union[StepLoadStatus, None]]\",\n dry_run: bool = False,\n ray_pipeline: bool = False,\n) -> None:\n \"\"\"Initializes the `_ProcessWrapper`.\n\n Args:\n step: The step to run.\n input_queue: The queue to receive the input data.\n output_queue: The queue to send the output data.\n load_queue: The queue used to notify the main process that the step has been\n loaded, has been unloaded or has failed to load.\n dry_run: Flag to ensure we are forcing to run the last batch.\n ray_pipeline: Whether the step is running a `RayPipeline` or not.\n \"\"\"\n self.step = step\n self.replica = replica\n self.input_queue = input_queue\n self.output_queue = output_queue\n self.load_queue = load_queue\n self.dry_run = dry_run\n self.ray_pipeline = ray_pipeline\n\n self._init_cuda_device_placement()\n "},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper.run","title":"run() ","text":"The target function executed by the process. This function will also handle the step lifecycle, executing first the load function of the Step and then waiting to receive a batch from the input_queue that will be handled by the process method of the Step . Returns: Type Description str The name of the step that was executed. Source code in src/distilabel/pipeline/step_wrapper.py def run(self) -> str:\n \"\"\"The target function executed by the process. This function will also handle\n the step lifecycle, executing first the `load` function of the `Step` and then\n waiting to receive a batch from the `input_queue` that will be handled by the\n `process` method of the `Step`.\n\n Returns:\n The name of the step that was executed.\n \"\"\"\n\n try:\n self.step.load()\n self.step._logger.debug(f\"Step '{self.step.name}' loaded!\")\n except Exception as e:\n self.step.unload()\n self._notify_load_failed()\n raise _StepWrapperException.create_load_error(\n message=f\"Step load failed: {e}\",\n step=self.step,\n subprocess_exception=e,\n ) from e\n\n self._notify_load()\n\n if self.step.is_generator:\n self._generator_step_process_loop()\n else:\n self._non_generator_process_loop()\n\n # Just in case `None` sentinel was sent\n try:\n self.input_queue.get(block=False)\n except Exception:\n pass\n\n self.step.unload()\n\n self._notify_unload()\n\n self.step._logger.info(\n f\"\ud83c\udfc1 Finished running step '{self.step.name}' (replica ID: {self.replica})\"\n )\n\n return self.step.name # type: ignore\n "},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException","title":"_StepWrapperException ","text":" Bases: Exception Exception to be raised when an error occurs in the _StepWrapper class. Attributes: Name Type Description message The error message. step The Step that raised the error. code The error code. subprocess_exception The exception raised by the subprocess. data The data that caused the error. Defaults to None . Source code in src/distilabel/pipeline/step_wrapper.py class _StepWrapperException(Exception):\n \"\"\"Exception to be raised when an error occurs in the `_StepWrapper` class.\n\n Attributes:\n message: The error message.\n step: The `Step` that raised the error.\n code: The error code.\n subprocess_exception: The exception raised by the subprocess.\n data: The data that caused the error. 
Defaults to `None`.\n \"\"\"\n\n def __init__(\n self,\n message: str,\n step: \"_Step\",\n code: int,\n subprocess_exception: Exception,\n data: Optional[List[List[Dict[str, Any]]]] = None,\n ) -> None:\n self.message = f\"{message}\\n\\nFor further information visit '{DISTILABEL_DOCS_URL}api/pipeline/step_wrapper'\"\n self.step = step\n self.code = code\n self.subprocess_exception = subprocess_exception\n self.formatted_traceback = \"\".join(\n traceback.format_exception(\n type(subprocess_exception),\n subprocess_exception,\n subprocess_exception.__traceback__,\n )\n )\n self.data = data\n\n @classmethod\n def create_load_error(\n cls,\n message: str,\n step: \"_Step\",\n subprocess_exception: Optional[Exception] = None,\n ) -> \"_StepWrapperException\":\n \"\"\"Creates a `_StepWrapperException` for a load error.\n\n Args:\n message: The error message.\n step: The `Step` that raised the error.\n subprocess_exception: The exception raised by the subprocess. Defaults to `None`.\n\n Returns:\n The `_StepWrapperException` instance.\n \"\"\"\n return cls(message, step, 1, subprocess_exception, None)\n\n @property\n def is_load_error(self) -> bool:\n \"\"\"Whether the error is a load error.\n\n Returns:\n `True` if the error is a load error, `False` otherwise.\n \"\"\"\n return self.code == 1\n "},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException.is_load_error","title":"is_load_error: bool property ","text":"Whether the error is a load error. Returns: Type Description bool True if the error is a load error, False otherwise. "},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException.create_load_error","title":"create_load_error(message, step, subprocess_exception=None) classmethod ","text":"Creates a _StepWrapperException for a load error. Parameters: Name Type Description Default message str The error message. required step _Step The Step that raised the error. required subprocess_exception Optional[Exception] The exception raised by the subprocess. Defaults to None . None Returns: Type Description _StepWrapperException The _StepWrapperException instance. Source code in src/distilabel/pipeline/step_wrapper.py @classmethod\ndef create_load_error(\n cls,\n message: str,\n step: \"_Step\",\n subprocess_exception: Optional[Exception] = None,\n) -> \"_StepWrapperException\":\n \"\"\"Creates a `_StepWrapperException` for a load error.\n\n Args:\n message: The error message.\n step: The `Step` that raised the error.\n subprocess_exception: The exception raised by the subprocess. Defaults to `None`.\n\n Returns:\n The `_StepWrapperException` instance.\n \"\"\"\n return cls(message, step, 1, subprocess_exception, None)\n "},{"location":"api/pipeline/typing/","title":"Pipeline Typing","text":""},{"location":"api/pipeline/typing/#distilabel.pipeline.typing","title":"typing ","text":""},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.DownstreamConnectable","title":"DownstreamConnectable = Union['Step', 'GlobalStep'] module-attribute ","text":"Alias for the Step types that can be connected as downstream steps. "},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.UpstreamConnectableSteps","title":"UpstreamConnectableSteps = TypeVar('UpstreamConnectableSteps', bound=Union['Step', 'GlobalStep', 'GeneratorStep']) module-attribute ","text":"Type for the Step types that can be connected as upstream steps. 
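For illustration (a hypothetical sketch in which loader, task and grouper stand for a GeneratorStep, a Step and a GlobalStep respectively): these aliases are what allow loader >> task >> grouper, since every left-hand side is an upstream-connectable step and every right-hand side a downstream-connectable one, while grouper >> loader falls outside them because a GeneratorStep is not part of DownstreamConnectable. 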
"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.DownstreamConnectableSteps","title":"DownstreamConnectableSteps = TypeVar('DownstreamConnectableSteps', bound=DownstreamConnectable, covariant=True) module-attribute ","text":"Type for the Step types that can be connected as downstream steps. "},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.PipelineRuntimeParametersInfo","title":"PipelineRuntimeParametersInfo = Dict[str, Union[List['RuntimeParameterInfo'], Dict[str, 'RuntimeParameterInfo']]] module-attribute ","text":"Alias for the information of the runtime parameters of a Pipeline . "},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.InputDataset","title":"InputDataset = Union['Dataset', 'pd.DataFrame', List[Dict[str, str]]] module-attribute ","text":"Alias for the types we can process as input dataset. "},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.LoadGroups","title":"LoadGroups = Union[List[List[Any]], Literal['sequential_step_execution']] module-attribute ","text":"Alias for the types that can be used as load groups. - if
List[List[Any]], it's a list containing lists of steps that have to be loaded in isolation. - if \"sequential_step_execution\", each step will be loaded in a different stage, i.e. only one step will be executed at a time.
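For example (a minimal sketch, assuming a pipeline with steps named \"load_data\", \"text_generation\" and \"keep_columns\", and assuming load_groups is passed as an argument to Pipeline.run): pipeline.run(\n load_groups=[\n [\"load_data\", \"text_generation\"],\n [\"keep_columns\"],\n ],\n)\n# or, to execute only one step at a time:\npipeline.run(load_groups=\"sequential_step_execution\")\n 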
"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.StepLoadStatus","title":"StepLoadStatus ","text":" Bases: TypedDict Dict containing information about if one step was loaded/unloaded or if it's load failed Source code in src/distilabel/pipeline/typing.py class StepLoadStatus(TypedDict):\n \"\"\"Dict containing information about if one step was loaded/unloaded or if it's load\n failed\"\"\"\n\n name: str\n status: Literal[\"loaded\", \"unloaded\", \"load_failed\"]\n "},{"location":"api/step/","title":"Step","text":"This section contains the API reference for the distilabel step, both for the _Step base class and the Step class. For more information and examples on how to use existing steps or create custom ones, please refer to Tutorial - Step. "},{"location":"api/step/#distilabel.steps.base","title":"base ","text":""},{"location":"api/step/#distilabel.steps.base.StepInput","title":"StepInput = Annotated[List[Dict[str, Any]], _STEP_INPUT_ANNOTATION] module-attribute ","text":"StepInput is just an Annotated alias of the typing List[Dict[str, Any]] with extra metadata that allows distilabel to perform validations over the process step method defined in each Step "},{"location":"api/step/#distilabel.steps.base._Step","title":"_Step ","text":" Bases: RuntimeParametersMixin , RequirementsMixin , SignatureMixin , BaseModel , _Serializable , ABC Base class for the steps that can be included in a Pipeline . A Step is a class defining some processing logic. The input and outputs for this processing logic are lists of dictionaries with the same keys: ```python\n[\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n]\n```\n The processing logic is defined in the process method, which depending on the number of previous steps, can receive more than one list of dictionaries, each with the output of the previous steps. In order to make distilabel know where the outputs from the previous steps are, the process function from each Step must have an argument or positional argument annotated with StepInput . ```python\nclass StepWithOnePreviousStep(Step):\n def process(self, inputs: StepInput) -> StepOutput:\n yield [...]\n\nclass StepWithSeveralPreviousStep(Step):\n # mind the * to indicate that the argument is a list of StepInput\n def process(self, *inputs: StepInput) -> StepOutput:\n yield [...]\n```\n In order to perform static validations and to check that the chaining of the steps in the pipeline is valid, a Step must also define the inputs and outputs properties: inputs : a list of strings with the names of the columns that the step needs as input. It can be an empty list if the step is a generator step. outputs : a list of strings with the names of the columns that the step will produce as output. Optionally, a Step can override the load method to perform any initialization logic before the process method is called. For example, to load an LLM, stablish a connection to a database, etc. Finally, the Step class inherits from pydantic.BaseModel , so attributes can be easily defined, validated, serialized and included in the __init__ method of the step. Source code in src/distilabel/steps/base.py class _Step(\n RuntimeParametersMixin,\n RequirementsMixin,\n SignatureMixin,\n BaseModel,\n _Serializable,\n ABC,\n):\n \"\"\"Base class for the steps that can be included in a `Pipeline`.\n\n A `Step` is a class defining some processing logic. 
The input and outputs for this\n processing logic are lists of dictionaries with the same keys:\n\n ```python\n [\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n ]\n ```\n\n The processing logic is defined in the `process` method, which depending on the\n number of previous steps, can receive more than one list of dictionaries, each with\n the output of the previous steps. In order to make `distilabel` know where the outputs\n from the previous steps are, the `process` function from each `Step` must have an argument\n or positional argument annotated with `StepInput`.\n\n ```python\n class StepWithOnePreviousStep(Step):\n def process(self, inputs: StepInput) -> StepOutput:\n yield [...]\n\n class StepWithSeveralPreviousStep(Step):\n # mind the * to indicate that the argument is a list of StepInput\n def process(self, *inputs: StepInput) -> StepOutput:\n yield [...]\n ```\n\n In order to perform static validations and to check that the chaining of the steps\n in the pipeline is valid, a `Step` must also define the `inputs` and `outputs`\n properties:\n\n - `inputs`: a list of strings with the names of the columns that the step needs as\n input. It can be an empty list if the step is a generator step.\n - `outputs`: a list of strings with the names of the columns that the step will\n produce as output.\n\n Optionally, a `Step` can override the `load` method to perform any initialization\n logic before the `process` method is called. For example, to load an LLM, stablish a\n connection to a database, etc.\n\n Finally, the `Step` class inherits from `pydantic.BaseModel`, so attributes can be easily\n defined, validated, serialized and included in the `__init__` method of the step.\n \"\"\"\n\n model_config = ConfigDict(\n arbitrary_types_allowed=True,\n validate_default=True,\n validate_assignment=True,\n extra=\"forbid\",\n )\n\n name: Optional[str] = Field(default=None, pattern=r\"^[a-zA-Z0-9_-]+$\")\n resources: StepResources = StepResources()\n pipeline: Any = Field(default=None, exclude=True, repr=False)\n input_mappings: Dict[str, str] = {}\n output_mappings: Dict[str, str] = {}\n use_cache: bool = True\n\n _pipeline_artifacts_path: Path = PrivateAttr(None)\n _built_from_decorator: bool = PrivateAttr(default=False)\n _logger: \"Logger\" = PrivateAttr(None)\n\n def model_post_init(self, __context: Any) -> None:\n from distilabel.pipeline.base import _GlobalPipelineManager\n\n super().model_post_init(__context)\n\n if self.pipeline is None:\n self.pipeline = _GlobalPipelineManager.get_pipeline()\n\n if self.pipeline is None:\n _logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n _logger.warning(\n f\"Step '{self.name}' hasn't received a pipeline, and it hasn't been\"\n \" created within a `Pipeline` context. 
Please, use\"\n \" `with Pipeline() as pipeline:` and create the step within the context.\"\n )\n\n if not self.name:\n # This must be done before the check for repeated names, but assuming\n # we are passing the pipeline from the _GlobalPipelineManager, should\n # be done after that.\n self.name = _infer_step_name(type(self).__name__, self.pipeline)\n\n if self.pipeline is not None:\n # If not set an error will be raised in `Pipeline.run` parent\n self.pipeline._add_step(self)\n\n def connect(\n self,\n *steps: \"_Step\",\n routing_batch_function: Optional[\"RoutingBatchFunction\"] = None,\n ) -> None:\n \"\"\"Connects the current step to another step in the pipeline, which means that\n the output of this step will be the input of the other step.\n\n Args:\n steps: The steps to connect to the current step.\n routing_batch_function: A function that receives a list of steps and returns\n a list of steps to which the output batch generated by this step should be\n routed. It should be used to define the routing logic of the pipeline. If\n not provided, the output batch will be routed to all the connected steps.\n Defaults to `None`.\n \"\"\"\n assert self.pipeline is not None\n\n if routing_batch_function:\n self._set_routing_batch_function(routing_batch_function)\n\n for step in steps:\n self.pipeline._add_edge(from_step=self.name, to_step=step.name) # type: ignore\n\n def _set_routing_batch_function(\n self, routing_batch_function: \"RoutingBatchFunction\"\n ) -> None:\n \"\"\"Sets a routing batch function for the batches generated by this step, so they\n get routed to specific downstream steps.\n\n Args:\n routing_batch_function: The routing batch function that will be used to route\n the batches generated by this step.\n \"\"\"\n self.pipeline._add_routing_batch_function(\n step_name=self.name, # type: ignore\n routing_batch_function=routing_batch_function,\n )\n routing_batch_function._step = self\n\n @overload\n def __rshift__(self, other: \"RoutingBatchFunction\") -> \"RoutingBatchFunction\": ...\n\n @overload\n def __rshift__(\n self, other: List[\"DownstreamConnectableSteps\"]\n ) -> List[\"DownstreamConnectableSteps\"]: ...\n\n @overload\n def __rshift__(self, other: \"DownstreamConnectable\") -> \"DownstreamConnectable\": ...\n\n def __rshift__(\n self,\n other: Union[\n \"DownstreamConnectable\",\n \"RoutingBatchFunction\",\n List[\"DownstreamConnectableSteps\"],\n ],\n ) -> Union[\n \"DownstreamConnectable\",\n \"RoutingBatchFunction\",\n List[\"DownstreamConnectableSteps\"],\n ]:\n \"\"\"Allows using the `>>` operator to connect steps in the pipeline.\n\n Args:\n other: The step to connect, a list of steps to connect to or a routing batch\n function to be set for the step.\n\n Returns:\n The connected step, the list of connected steps or the routing batch function.\n\n Example:\n ```python\n step1 >> step2\n # Would be equivalent to:\n step1.connect(step2)\n\n # It also allows to connect a list of steps\n step1 >> [step2, step3]\n ```\n \"\"\"\n # Here to avoid circular imports\n from distilabel.pipeline.routing_batch_function import RoutingBatchFunction\n\n if isinstance(other, list):\n self.connect(*other)\n return other\n\n if isinstance(other, RoutingBatchFunction):\n self._set_routing_batch_function(other)\n return other\n\n self.connect(other)\n return other\n\n def __rrshift__(self, other: List[\"UpstreamConnectableSteps\"]) -> Self:\n \"\"\"Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline\n to a single step, as the list 
doesn't have the __rshift__ operator.\n\n Args:\n other: The step to connect to.\n\n Returns:\n The connected step\n\n Example:\n ```python\n [step2, step3] >> step1\n # Would be equivalent to:\n step2.connect(step1)\n step3.connect(step1)\n ```\n \"\"\"\n for o in other:\n o.connect(self)\n return self\n\n def load(self) -> None:\n \"\"\"Method to perform any initialization logic before the `process` method is\n called. For example, to load an LLM, stablish a connection to a database, etc.\n \"\"\"\n self._logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n\n def unload(self) -> None:\n \"\"\"Method to perform any cleanup logic after the `process` method is called. For\n example, to close a connection to a database, etc.\n \"\"\"\n self._logger.debug(\"Executing step unload logic.\")\n\n @property\n def is_generator(self) -> bool:\n \"\"\"Whether the step is a generator step or not.\n\n Returns:\n `True` if the step is a generator step, `False` otherwise.\n \"\"\"\n return isinstance(self, GeneratorStep)\n\n @property\n def is_global(self) -> bool:\n \"\"\"Whether the step is a global step or not.\n\n Returns:\n `True` if the step is a global step, `False` otherwise.\n \"\"\"\n return isinstance(self, GlobalStep)\n\n @property\n def is_normal(self) -> bool:\n \"\"\"Whether the step is a normal step or not.\n\n Returns:\n `True` if the step is a normal step, `False` otherwise.\n \"\"\"\n return not self.is_generator and not self.is_global\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"List of strings with the names of the mandatory columns that the step needs as\n input or dictionary in which the keys are the input columns of the step and the\n values are booleans indicating whether the column is optional or not.\n\n Returns:\n List of strings with the names of the columns that the step needs as input.\n \"\"\"\n return []\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"List of strings with the names of the columns that the step will produce as\n output or dictionary in which the keys are the output columns of the step and the\n values are booleans indicating whether the column is optional or not.\n\n Returns:\n List of strings with the names of the columns that the step will produce as\n output.\n \"\"\"\n return []\n\n @cached_property\n def process_parameters(self) -> List[inspect.Parameter]:\n \"\"\"Returns the parameters of the `process` method of the step.\n\n Returns:\n The parameters of the `process` method of the step.\n \"\"\"\n return list(inspect.signature(self.process).parameters.values()) # type: ignore\n\n def has_multiple_inputs(self) -> bool:\n \"\"\"Whether the `process` method of the step receives more than one input or not\n i.e. 
has a `*` argument annotated with `StepInput`.\n\n Returns:\n `True` if the `process` method of the step receives more than one input,\n `False` otherwise.\n \"\"\"\n return any(\n param.kind == param.VAR_POSITIONAL for param in self.process_parameters\n )\n\n def get_process_step_input(self) -> Union[inspect.Parameter, None]:\n \"\"\"Returns the parameter of the `process` method of the step annotated with\n `StepInput`.\n\n Returns:\n The parameter of the `process` method of the step annotated with `StepInput`,\n or `None` if there is no parameter annotated with `StepInput`.\n\n Raises:\n TypeError: If the step has more than one parameter annotated with `StepInput`.\n \"\"\"\n step_input_parameter = None\n for parameter in self.process_parameters:\n if is_parameter_annotated_with(parameter, _STEP_INPUT_ANNOTATION):\n if step_input_parameter is not None:\n raise DistilabelTypeError(\n f\"Step '{self.name}' should have only one parameter with type\"\n \" hint `StepInput`.\",\n page=\"sections/how_to_guides/basic/step/#defining-custom-steps\",\n )\n step_input_parameter = parameter\n return step_input_parameter\n\n def verify_inputs_mappings(self) -> None:\n \"\"\"Verifies that the `inputs_mappings` of the step are valid i.e. the input\n columns exist in the inputs of the step.\n\n Raises:\n ValueError: If the `inputs_mappings` of the step are not valid.\n \"\"\"\n if not self.input_mappings:\n return\n\n for input in self.input_mappings:\n if input not in self.inputs:\n raise DistilabelUserError(\n f\"The input column '{input}' doesn't exist in the inputs of the\"\n f\" step '{self.name}'. Inputs of the step are: {self.inputs}.\"\n \" Please, review the `inputs_mappings` argument of the step.\",\n page=\"sections/how_to_guides/basic/step/#arguments\",\n )\n\n def verify_outputs_mappings(self) -> None:\n \"\"\"Verifies that the `outputs_mappings` of the step are valid i.e. the output\n columns exist in the outputs of the step.\n\n Raises:\n ValueError: If the `outputs_mappings` of the step are not valid.\n \"\"\"\n if not self.output_mappings:\n return\n\n for output in self.output_mappings:\n if output not in self.outputs:\n raise DistilabelUserError(\n f\"The output column '{output}' doesn't exist in the outputs of the\"\n f\" step '{self.name}'. Outputs of the step are: {self.outputs}.\"\n \" Please, review the `outputs_mappings` argument of the step.\",\n page=\"sections/how_to_guides/basic/step/#arguments\",\n )\n\n def get_inputs(self) -> Dict[str, bool]:\n \"\"\"Gets the inputs of the step after the `input_mappings`. This method is meant\n to be used to run validations on the inputs of the step.\n\n Returns:\n The inputs of the step after the `input_mappings` and if they are required or\n not.\n \"\"\"\n if isinstance(self.inputs, list):\n return {\n self.input_mappings.get(input, input): True for input in self.inputs\n }\n\n return {\n self.input_mappings.get(input, input): required\n for input, required in self.inputs.items()\n }\n\n def get_outputs(self) -> Dict[str, bool]:\n \"\"\"Gets the outputs of the step after the `outputs_mappings`. 
This method is\n meant to be used to run validations on the outputs of the step.\n\n Returns:\n The outputs of the step after the `outputs_mappings` and if they are required\n or not.\n \"\"\"\n if isinstance(self.outputs, list):\n return {\n self.output_mappings.get(output, output): True\n for output in self.outputs\n }\n\n return {\n self.output_mappings.get(output, output): required\n for output, required in self.outputs.items()\n }\n\n def set_pipeline_artifacts_path(self, path: Path) -> None:\n \"\"\"Sets the `_pipeline_artifacts_path` attribute. This method is meant to be used\n by the `Pipeline` once the cache location is known.\n\n Args:\n path: the path where the artifacts generated by the pipeline steps should be\n saved.\n \"\"\"\n self._pipeline_artifacts_path = path\n\n @property\n def artifacts_directory(self) -> Union[Path, None]:\n \"\"\"Gets the path of the directory where the step should save its generated artifacts.\n\n Returns:\n The path of the directory where the step should save the generated artifacts,\n or `None` if `_pipeline_artifacts_path` is not set.\n \"\"\"\n if self._pipeline_artifacts_path is None:\n return None\n return self._pipeline_artifacts_path / self.name # type: ignore\n\n def save_artifact(\n self,\n name: str,\n write_function: Callable[[Path], None],\n metadata: Optional[Dict[str, Any]] = None,\n ) -> None:\n \"\"\"Saves an artifact generated by the `Step`.\n\n Args:\n name: the name of the artifact.\n write_function: a function that will receive the path where the artifact should\n be saved.\n metadata: the artifact metadata. Defaults to `None`.\n \"\"\"\n if self.artifacts_directory is None:\n self._logger.warning(\n f\"Cannot save artifact with '{name}' as `_pipeline_artifacts_path` is not\"\n \" set. This is normal if the `Step` is being executed as a standalone component.\"\n )\n return\n\n artifact_directory_path = self.artifacts_directory / name\n artifact_directory_path.mkdir(parents=True, exist_ok=True)\n\n self._logger.info(f\"\ud83c\udffa Storing '{name}' generated artifact...\")\n\n self._logger.debug(\n f\"Calling `write_function` to write artifact in '{artifact_directory_path}'...\"\n )\n write_function(artifact_directory_path)\n\n metadata_path = artifact_directory_path / \"metadata.json\"\n self._logger.debug(\n f\"Calling `write_json` to write artifact metadata in '{metadata_path}'...\"\n )\n write_json(filename=metadata_path, data=metadata or {})\n\n def impute_step_outputs(\n self, step_output: List[Dict[str, Any]]\n ) -> List[Dict[str, Any]]:\n \"\"\"\n Imputes the output columns of the step that are not present in the step output.\n \"\"\"\n result = []\n for row in step_output:\n data = row.copy()\n for output in self.get_outputs().keys():\n data[output] = None\n result.append(data)\n return result\n\n def _model_dump(self, obj: Any, **kwargs: Any) -> Dict[str, Any]:\n dump = super()._model_dump(obj, **kwargs)\n dump[\"runtime_parameters_info\"] = self.get_runtime_parameters_info()\n return dump\n "},{"location":"api/step/#distilabel.steps.base._Step.is_generator","title":"is_generator: bool property ","text":"Whether the step is a generator step or not. Returns: Type Description bool True if the step is a generator step, False otherwise. "},{"location":"api/step/#distilabel.steps.base._Step.is_global","title":"is_global: bool property ","text":"Whether the step is a global step or not. Returns: Type Description bool True if the step is a global step, False otherwise. 
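For illustration (a sketch in terms of the classes these properties check): an instance of GeneratorStep has only is_generator set to True, an instance of GlobalStep has only is_global set to True, and a step that is neither has only is_normal set to True, since is_normal is defined as not is_generator and not is_global. 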
"},{"location":"api/step/#distilabel.steps.base._Step.is_normal","title":"is_normal: bool property ","text":"Whether the step is a normal step or not. Returns: Type Description bool True if the step is a normal step, False otherwise. "},{"location":"api/step/#distilabel.steps.base._Step.inputs","title":"inputs: StepColumns property ","text":"List of strings with the names of the mandatory columns that the step needs as input or dictionary in which the keys are the input columns of the step and the values are booleans indicating whether the column is optional or not. Returns: Type Description StepColumns List of strings with the names of the columns that the step needs as input. "},{"location":"api/step/#distilabel.steps.base._Step.outputs","title":"outputs: StepColumns property ","text":"List of strings with the names of the columns that the step will produce as output or dictionary in which the keys are the output columns of the step and the values are booleans indicating whether the column is optional or not. Returns: Type Description StepColumns List of strings with the names of the columns that the step will produce as StepColumns output. "},{"location":"api/step/#distilabel.steps.base._Step.process_parameters","title":"process_parameters: List[inspect.Parameter] cached property ","text":"Returns the parameters of the process method of the step. Returns: Type Description List[Parameter] The parameters of the process method of the step. "},{"location":"api/step/#distilabel.steps.base._Step.artifacts_directory","title":"artifacts_directory: Union[Path, None] property ","text":"Gets the path of the directory where the step should save its generated artifacts. Returns: Type Description Union[Path, None] The path of the directory where the step should save the generated artifacts, or None if _pipeline_artifacts_path is not set. "},{"location":"api/step/#distilabel.steps.base._Step.connect","title":"connect(*steps, routing_batch_function=None) ","text":"Connects the current step to another step in the pipeline, which means that the output of this step will be the input of the other step. Parameters: Name Type Description Default steps _Step The steps to connect to the current step. () routing_batch_function Optional[RoutingBatchFunction] A function that receives a list of steps and returns a list of steps to which the output batch generated by this step should be routed. It should be used to define the routing logic of the pipeline. If not provided, the output batch will be routed to all the connected steps. Defaults to None . None Source code in src/distilabel/steps/base.py def connect(\n self,\n *steps: \"_Step\",\n routing_batch_function: Optional[\"RoutingBatchFunction\"] = None,\n) -> None:\n \"\"\"Connects the current step to another step in the pipeline, which means that\n the output of this step will be the input of the other step.\n\n Args:\n steps: The steps to connect to the current step.\n routing_batch_function: A function that receives a list of steps and returns\n a list of steps to which the output batch generated by this step should be\n routed. It should be used to define the routing logic of the pipeline. 
If\n not provided, the output batch will be routed to all the connected steps.\n Defaults to `None`.\n \"\"\"\n assert self.pipeline is not None\n\n if routing_batch_function:\n self._set_routing_batch_function(routing_batch_function)\n\n for step in steps:\n self.pipeline._add_edge(from_step=self.name, to_step=step.name) # type: ignore\n "},{"location":"api/step/#distilabel.steps.base._Step.__rshift__","title":"__rshift__(other) ","text":"__rshift__(other: RoutingBatchFunction) -> RoutingBatchFunction\n
__rshift__(other: List[DownstreamConnectableSteps]) -> List[DownstreamConnectableSteps]\n
__rshift__(other: DownstreamConnectable) -> DownstreamConnectable\n Allows using the >> operator to connect steps in the pipeline. Parameters: Name Type Description Default other Union[DownstreamConnectable, RoutingBatchFunction, List[DownstreamConnectableSteps]] The step to connect, a list of steps to connect to or a routing batch function to be set for the step. required Returns: Type Description Union[DownstreamConnectable, RoutingBatchFunction, List[DownstreamConnectableSteps]] The connected step, the list of connected steps or the routing batch function. Example step1 >> step2\n# Would be equivalent to:\nstep1.connect(step2)\n\n# It also allows to connect a list of steps\nstep1 >> [step2, step3]\n Source code in src/distilabel/steps/base.py def __rshift__(\n self,\n other: Union[\n \"DownstreamConnectable\",\n \"RoutingBatchFunction\",\n List[\"DownstreamConnectableSteps\"],\n ],\n) -> Union[\n \"DownstreamConnectable\",\n \"RoutingBatchFunction\",\n List[\"DownstreamConnectableSteps\"],\n]:\n \"\"\"Allows using the `>>` operator to connect steps in the pipeline.\n\n Args:\n other: The step to connect, a list of steps to connect to or a routing batch\n function to be set for the step.\n\n Returns:\n The connected step, the list of connected steps or the routing batch function.\n\n Example:\n ```python\n step1 >> step2\n # Would be equivalent to:\n step1.connect(step2)\n\n # It also allows to connect a list of steps\n step1 >> [step2, step3]\n ```\n \"\"\"\n # Here to avoid circular imports\n from distilabel.pipeline.routing_batch_function import RoutingBatchFunction\n\n if isinstance(other, list):\n self.connect(*other)\n return other\n\n if isinstance(other, RoutingBatchFunction):\n self._set_routing_batch_function(other)\n return other\n\n self.connect(other)\n return other\n "},{"location":"api/step/#distilabel.steps.base._Step.__rrshift__","title":"__rrshift__(other) ","text":"Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline to a single step, as the list doesn't have the rshift operator. Parameters: Name Type Description Default other List[UpstreamConnectableSteps] The step to connect to. required Returns: Type Description Self The connected step Example [step2, step3] >> step1\n# Would be equivalent to:\nstep2.connect(step1)\nstep3.connect(step1)\n Source code in src/distilabel/steps/base.py def __rrshift__(self, other: List[\"UpstreamConnectableSteps\"]) -> Self:\n \"\"\"Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline\n to a single step, as the list doesn't have the __rshift__ operator.\n\n Args:\n other: The step to connect to.\n\n Returns:\n The connected step\n\n Example:\n ```python\n [step2, step3] >> step1\n # Would be equivalent to:\n step2.connect(step1)\n step3.connect(step1)\n ```\n \"\"\"\n for o in other:\n o.connect(self)\n return self\n "},{"location":"api/step/#distilabel.steps.base._Step.load","title":"load() ","text":"Method to perform any initialization logic before the process method is called. For example, to load an LLM, stablish a connection to a database, etc. Source code in src/distilabel/steps/base.py def load(self) -> None:\n \"\"\"Method to perform any initialization logic before the `process` method is\n called. 
For example, to load an LLM, stablish a connection to a database, etc.\n \"\"\"\n self._logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n "},{"location":"api/step/#distilabel.steps.base._Step.unload","title":"unload() ","text":"Method to perform any cleanup logic after the process method is called. For example, to close a connection to a database, etc. Source code in src/distilabel/steps/base.py def unload(self) -> None:\n \"\"\"Method to perform any cleanup logic after the `process` method is called. For\n example, to close a connection to a database, etc.\n \"\"\"\n self._logger.debug(\"Executing step unload logic.\")\n "},{"location":"api/step/#distilabel.steps.base._Step.has_multiple_inputs","title":"has_multiple_inputs() ","text":"Whether the process method of the step receives more than one input or not i.e. has a * argument annotated with StepInput . Returns: Type Description bool True if the process method of the step receives more than one input, bool False otherwise. Source code in src/distilabel/steps/base.py def has_multiple_inputs(self) -> bool:\n \"\"\"Whether the `process` method of the step receives more than one input or not\n i.e. has a `*` argument annotated with `StepInput`.\n\n Returns:\n `True` if the `process` method of the step receives more than one input,\n `False` otherwise.\n \"\"\"\n return any(\n param.kind == param.VAR_POSITIONAL for param in self.process_parameters\n )\n "},{"location":"api/step/#distilabel.steps.base._Step.get_process_step_input","title":"get_process_step_input() ","text":"Returns the parameter of the process method of the step annotated with StepInput . Returns: Type Description Union[Parameter, None] The parameter of the process method of the step annotated with StepInput , Union[Parameter, None] or None if there is no parameter annotated with StepInput . Raises: Type Description TypeError If the step has more than one parameter annotated with StepInput . Source code in src/distilabel/steps/base.py def get_process_step_input(self) -> Union[inspect.Parameter, None]:\n \"\"\"Returns the parameter of the `process` method of the step annotated with\n `StepInput`.\n\n Returns:\n The parameter of the `process` method of the step annotated with `StepInput`,\n or `None` if there is no parameter annotated with `StepInput`.\n\n Raises:\n TypeError: If the step has more than one parameter annotated with `StepInput`.\n \"\"\"\n step_input_parameter = None\n for parameter in self.process_parameters:\n if is_parameter_annotated_with(parameter, _STEP_INPUT_ANNOTATION):\n if step_input_parameter is not None:\n raise DistilabelTypeError(\n f\"Step '{self.name}' should have only one parameter with type\"\n \" hint `StepInput`.\",\n page=\"sections/how_to_guides/basic/step/#defining-custom-steps\",\n )\n step_input_parameter = parameter\n return step_input_parameter\n "},{"location":"api/step/#distilabel.steps.base._Step.verify_inputs_mappings","title":"verify_inputs_mappings() ","text":"Verifies that the inputs_mappings of the step are valid i.e. the input columns exist in the inputs of the step. Raises: Type Description ValueError If the inputs_mappings of the step are not valid. Source code in src/distilabel/steps/base.py def verify_inputs_mappings(self) -> None:\n \"\"\"Verifies that the `inputs_mappings` of the step are valid i.e. 
the input\n columns exist in the inputs of the step.\n\n Raises:\n ValueError: If the `inputs_mappings` of the step are not valid.\n \"\"\"\n if not self.input_mappings:\n return\n\n for input in self.input_mappings:\n if input not in self.inputs:\n raise DistilabelUserError(\n f\"The input column '{input}' doesn't exist in the inputs of the\"\n f\" step '{self.name}'. Inputs of the step are: {self.inputs}.\"\n \" Please, review the `inputs_mappings` argument of the step.\",\n page=\"sections/how_to_guides/basic/step/#arguments\",\n )\n "},{"location":"api/step/#distilabel.steps.base._Step.verify_outputs_mappings","title":"verify_outputs_mappings() ","text":"Verifies that the outputs_mappings of the step are valid i.e. the output columns exist in the outputs of the step. Raises: Type Description ValueError If the outputs_mappings of the step are not valid. Source code in src/distilabel/steps/base.py def verify_outputs_mappings(self) -> None:\n \"\"\"Verifies that the `outputs_mappings` of the step are valid i.e. the output\n columns exist in the outputs of the step.\n\n Raises:\n ValueError: If the `outputs_mappings` of the step are not valid.\n \"\"\"\n if not self.output_mappings:\n return\n\n for output in self.output_mappings:\n if output not in self.outputs:\n raise DistilabelUserError(\n f\"The output column '{output}' doesn't exist in the outputs of the\"\n f\" step '{self.name}'. Outputs of the step are: {self.outputs}.\"\n \" Please, review the `outputs_mappings` argument of the step.\",\n page=\"sections/how_to_guides/basic/step/#arguments\",\n )\n "},{"location":"api/step/#distilabel.steps.base._Step.get_inputs","title":"get_inputs() ","text":"Gets the inputs of the step after the input_mappings . This method is meant to be used to run validations on the inputs of the step. Returns: Type Description Dict[str, bool] The inputs of the step after the input_mappings and if they are required or Dict[str, bool] not. Source code in src/distilabel/steps/base.py def get_inputs(self) -> Dict[str, bool]:\n \"\"\"Gets the inputs of the step after the `input_mappings`. This method is meant\n to be used to run validations on the inputs of the step.\n\n Returns:\n The inputs of the step after the `input_mappings` and if they are required or\n not.\n \"\"\"\n if isinstance(self.inputs, list):\n return {\n self.input_mappings.get(input, input): True for input in self.inputs\n }\n\n return {\n self.input_mappings.get(input, input): required\n for input, required in self.inputs.items()\n }\n "},{"location":"api/step/#distilabel.steps.base._Step.get_outputs","title":"get_outputs() ","text":"Gets the outputs of the step after the outputs_mappings . This method is meant to be used to run validations on the outputs of the step. Returns: Type Description Dict[str, bool] The outputs of the step after the outputs_mappings and if they are required Dict[str, bool] or not. Source code in src/distilabel/steps/base.py def get_outputs(self) -> Dict[str, bool]:\n \"\"\"Gets the outputs of the step after the `outputs_mappings`. 
This method is\n meant to be used to run validations on the outputs of the step.\n\n Returns:\n The outputs of the step after the `outputs_mappings` and if they are required\n or not.\n \"\"\"\n if isinstance(self.outputs, list):\n return {\n self.output_mappings.get(output, output): True\n for output in self.outputs\n }\n\n return {\n self.output_mappings.get(output, output): required\n for output, required in self.outputs.items()\n }\n "},{"location":"api/step/#distilabel.steps.base._Step.set_pipeline_artifacts_path","title":"set_pipeline_artifacts_path(path) ","text":"Sets the _pipeline_artifacts_path attribute. This method is meant to be used by the Pipeline once the cache location is known. Parameters: Name Type Description Default path Path the path where the artifacts generated by the pipeline steps should be saved. required Source code in src/distilabel/steps/base.py def set_pipeline_artifacts_path(self, path: Path) -> None:\n \"\"\"Sets the `_pipeline_artifacts_path` attribute. This method is meant to be used\n by the `Pipeline` once the cache location is known.\n\n Args:\n path: the path where the artifacts generated by the pipeline steps should be\n saved.\n \"\"\"\n self._pipeline_artifacts_path = path\n "},{"location":"api/step/#distilabel.steps.base._Step.save_artifact","title":"save_artifact(name, write_function, metadata=None) ","text":"Saves an artifact generated by the Step . Parameters: Name Type Description Default name str the name of the artifact. required write_function Callable[[Path], None] a function that will receive the path where the artifact should be saved. required metadata Optional[Dict[str, Any]] the artifact metadata. Defaults to None . None Source code in src/distilabel/steps/base.py def save_artifact(\n self,\n name: str,\n write_function: Callable[[Path], None],\n metadata: Optional[Dict[str, Any]] = None,\n) -> None:\n \"\"\"Saves an artifact generated by the `Step`.\n\n Args:\n name: the name of the artifact.\n write_function: a function that will receive the path where the artifact should\n be saved.\n metadata: the artifact metadata. Defaults to `None`.\n \"\"\"\n if self.artifacts_directory is None:\n self._logger.warning(\n f\"Cannot save artifact with '{name}' as `_pipeline_artifacts_path` is not\"\n \" set. This is normal if the `Step` is being executed as a standalone component.\"\n )\n return\n\n artifact_directory_path = self.artifacts_directory / name\n artifact_directory_path.mkdir(parents=True, exist_ok=True)\n\n self._logger.info(f\"\ud83c\udffa Storing '{name}' generated artifact...\")\n\n self._logger.debug(\n f\"Calling `write_function` to write artifact in '{artifact_directory_path}'...\"\n )\n write_function(artifact_directory_path)\n\n metadata_path = artifact_directory_path / \"metadata.json\"\n self._logger.debug(\n f\"Calling `write_json` to write artifact metadata in '{metadata_path}'...\"\n )\n write_json(filename=metadata_path, data=metadata or {})\n "},{"location":"api/step/#distilabel.steps.base._Step.impute_step_outputs","title":"impute_step_outputs(step_output) ","text":"Imputes the output columns of the step that are not present in the step output. 
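As a quick, hypothetical illustration of how `output_mappings` interacts with `get_outputs` and `impute_step_outputs`, the sketch below defines a trivial step with the `@step` decorator (see its reference further down); the `EchoStep` name and its columns are invented.

```python
from distilabel.steps import StepInput
from distilabel.steps.decorator import step
from distilabel.steps.typing import StepOutput


@step(inputs=["instruction"], outputs=["generation"])
def EchoStep(inputs: StepInput) -> StepOutput:
    # Trivial processing logic: copy the instruction into the output column.
    for row in inputs:
        row["generation"] = row["instruction"]
    yield inputs


echo = EchoStep(output_mappings={"generation": "answer"})

# `get_outputs` applies `output_mappings`, so the declared `generation`
# column is exposed downstream as `answer`.
print(echo.get_outputs())  # {'answer': True}

# `impute_step_outputs` fills the (mapped) output columns with `None`.
print(echo.impute_step_outputs([{"instruction": "hi"}]))
# [{'instruction': 'hi', 'answer': None}]
```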
Source code in src/distilabel/steps/base.py def impute_step_outputs(\n self, step_output: List[Dict[str, Any]]\n) -> List[Dict[str, Any]]:\n \"\"\"\n Imputes the output columns of the step that are not present in the step output.\n \"\"\"\n result = []\n for row in step_output:\n data = row.copy()\n for output in self.get_outputs().keys():\n data[output] = None\n result.append(data)\n return result\n "},{"location":"api/step/#distilabel.steps.base.Step","title":"Step ","text":" Bases: _Step , ABC Base class for the steps that can be included in a Pipeline . Attributes: Name Type Description input_batch_size RuntimeParameter[PositiveInt] The number of rows that will contain the batches processed by the step. Defaults to 50 . Runtime parameters input_batch_size : The number of rows that will contain the batches processed by the step. Defaults to 50 . Source code in src/distilabel/steps/base.py class Step(_Step, ABC):\n \"\"\"Base class for the steps that can be included in a `Pipeline`.\n\n Attributes:\n input_batch_size: The number of rows that will contain the batches processed by\n the step. Defaults to `50`.\n\n Runtime parameters:\n - `input_batch_size`: The number of rows that will contain the batches processed\n by the step. Defaults to `50`.\n \"\"\"\n\n input_batch_size: RuntimeParameter[PositiveInt] = Field(\n default=DEFAULT_INPUT_BATCH_SIZE,\n description=\"The number of rows that will contain the batches processed by the\"\n \" step.\",\n )\n\n @abstractmethod\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"Method that defines the processing logic of the step. It should yield the\n output rows.\n\n Args:\n *inputs: An argument used to receive the outputs of the previous steps. The\n number of arguments depends on the number of previous steps. It doesn't\n need to be an `*args` argument, it can be a regular argument annotated\n with `StepInput` if the step has only one previous step.\n \"\"\"\n pass\n\n def process_applying_mappings(self, *args: List[Dict[str, Any]]) -> \"StepOutput\":\n \"\"\"Runs the `process` method of the step applying the `input_mappings` to the input\n rows and the `outputs_mappings` to the output rows. This is the function that\n should be used to run the processing logic of the step.\n\n Yields:\n The output rows.\n \"\"\"\n\n inputs, overriden_inputs = (\n self._apply_input_mappings(args)\n if self.input_mappings\n else (args, [{} for _ in range(len(args[0]))])\n )\n\n # If the `Step` was built using the `@step` decorator, then we need to pass\n # the runtime parameters as kwargs, so they can be used within the processing\n # function\n generator = (\n self.process(*inputs)\n if not self._built_from_decorator\n else self.process(*inputs, **self._runtime_parameters)\n )\n\n for output_rows in generator:\n restored = []\n for i, row in enumerate(output_rows):\n # Correct the index here because we don't know the num_generations from the llm\n # ahead of time. 
For example, if we have `len(overriden_inputs)==5` and `len(row)==10`,\n # from `num_generations==2` and `group_generations=False` in the LLM:\n # The loop will use indices 0, 1, 2, 3, 4, 0, 1, 2, 3, 4\n ntimes_i = i % len(overriden_inputs)\n restored.append(\n self._apply_mappings_and_restore_overriden(\n row, overriden_inputs[ntimes_i]\n )\n )\n yield restored\n\n def _apply_input_mappings(\n self, inputs: Tuple[List[Dict[str, Any]], ...]\n ) -> Tuple[Tuple[List[Dict[str, Any]], ...], List[Dict[str, Any]]]:\n \"\"\"Applies the `input_mappings` to the input rows.\n\n Args:\n inputs: The input rows.\n\n Returns:\n The input rows with the `input_mappings` applied and the overriden values\n that were replaced by the `input_mappings`.\n \"\"\"\n reverted_input_mappings = {v: k for k, v in self.input_mappings.items()}\n\n renamed_inputs = []\n overriden_inputs = []\n for i, row_inputs in enumerate(inputs):\n renamed_row_inputs = []\n for row in row_inputs:\n overriden_keys = {}\n renamed_row = {}\n for k, v in row.items():\n renamed_key = reverted_input_mappings.get(k, k)\n\n if renamed_key not in renamed_row or k != renamed_key:\n renamed_row[renamed_key] = v\n\n if k != renamed_key and renamed_key in row and len(inputs) == 1:\n overriden_keys[renamed_key] = row[renamed_key]\n\n if i == 0:\n overriden_inputs.append(overriden_keys)\n renamed_row_inputs.append(renamed_row)\n renamed_inputs.append(renamed_row_inputs)\n return tuple(renamed_inputs), overriden_inputs\n\n def _apply_mappings_and_restore_overriden(\n self, row: Dict[str, Any], overriden: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"Reverts the `input_mappings` applied to the input rows and applies the `output_mappings`\n to the output rows. In addition, it restores the overriden values that were replaced\n by the `input_mappings`.\n\n Args:\n row: The output row.\n overriden: The overriden values that were replaced by the `input_mappings`.\n\n Returns:\n The output row with the `output_mappings` applied and the overriden values\n restored.\n \"\"\"\n result = {}\n for k, v in row.items():\n mapped_key = (\n self.output_mappings.get(k, None)\n or self.input_mappings.get(k, None)\n or k\n )\n result[mapped_key] = v\n\n # Restore overriden values\n for k, v in overriden.items():\n if k not in result:\n result[k] = v\n\n return result\n "},{"location":"api/step/#distilabel.steps.base.Step.process","title":"process(*inputs) abstractmethod ","text":"Method that defines the processing logic of the step. It should yield the output rows. Parameters: Name Type Description Default *inputs StepInput An argument used to receive the outputs of the previous steps. The number of arguments depends on the number of previous steps. It doesn't need to be an *args argument, it can be a regular argument annotated with StepInput if the step has only one previous step. () Source code in src/distilabel/steps/base.py @abstractmethod\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"Method that defines the processing logic of the step. It should yield the\n output rows.\n\n Args:\n *inputs: An argument used to receive the outputs of the previous steps. The\n number of arguments depends on the number of previous steps. 
It doesn't\n need to be an `*args` argument, it can be a regular argument annotated\n with `StepInput` if the step has only one previous step.\n \"\"\"\n pass\n "},{"location":"api/step/#distilabel.steps.base.Step.process_applying_mappings","title":"process_applying_mappings(*args) ","text":"Runs the process method of the step applying the input_mappings to the input rows and the outputs_mappings to the output rows. This is the function that should be used to run the processing logic of the step. Yields: Type Description StepOutput The output rows. Source code in src/distilabel/steps/base.py def process_applying_mappings(self, *args: List[Dict[str, Any]]) -> \"StepOutput\":\n \"\"\"Runs the `process` method of the step applying the `input_mappings` to the input\n rows and the `outputs_mappings` to the output rows. This is the function that\n should be used to run the processing logic of the step.\n\n Yields:\n The output rows.\n \"\"\"\n\n inputs, overriden_inputs = (\n self._apply_input_mappings(args)\n if self.input_mappings\n else (args, [{} for _ in range(len(args[0]))])\n )\n\n # If the `Step` was built using the `@step` decorator, then we need to pass\n # the runtime parameters as kwargs, so they can be used within the processing\n # function\n generator = (\n self.process(*inputs)\n if not self._built_from_decorator\n else self.process(*inputs, **self._runtime_parameters)\n )\n\n for output_rows in generator:\n restored = []\n for i, row in enumerate(output_rows):\n # Correct the index here because we don't know the num_generations from the llm\n # ahead of time. For example, if we have `len(overriden_inputs)==5` and `len(row)==10`,\n # from `num_generations==2` and `group_generations=False` in the LLM:\n # The loop will use indices 0, 1, 2, 3, 4, 0, 1, 2, 3, 4\n ntimes_i = i % len(overriden_inputs)\n restored.append(\n self._apply_mappings_and_restore_overriden(\n row, overriden_inputs[ntimes_i]\n )\n )\n yield restored\n "},{"location":"api/step/decorator/","title":"@step","text":"This section contains the reference for the @step decorator, used to create new Step subclasses without having to manually define the class. For more information check the Tutorial - Step page. "},{"location":"api/step/decorator/#distilabel.steps.decorator","title":"decorator ","text":""},{"location":"api/step/decorator/#distilabel.steps.decorator.step","title":"step(inputs=None, outputs=None, step_type='normal') ","text":"step(inputs: Union[StepColumns, None] = None, outputs: Union[StepColumns, None] = None, step_type: Literal['normal'] = 'normal') -> Callable[..., Type[Step]]\n
step(inputs: Union[StepColumns, None] = None, outputs: Union[StepColumns, None] = None, step_type: Literal['global'] = 'global') -> Callable[..., Type[GlobalStep]]\n
step(inputs: None = None, outputs: Union[StepColumns, None] = None, step_type: Literal['generator'] = 'generator') -> Callable[..., Type[GeneratorStep]]\n Creates an Step from a processing function. Parameters: Name Type Description Default inputs Union[StepColumns, None] a list containing the name of the inputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not, that are required by the step. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None . None outputs Union[StepColumns, None] a list containing the name of the outputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column will be generated or not. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None . None step_type Literal['normal', 'global', 'generator'] the kind of step to create. Valid choices are: \"normal\" (Step ), \"global\" (GlobalStep ) or \"generator\" (GeneratorStep ). Defaults to \"normal\" . 'normal' Returns: Type Description Callable[..., Type[_Step]] A callable that will generate the type given the processing function. Example: # Normal step\n@step(inputs=[\"instruction\"], outputs=[\"generation\"])\ndef GenerationStep(inputs: StepInput, dummy_generation: RuntimeParameter[str]) -> StepOutput:\n for input in inputs:\n input[\"generation\"] = dummy_generation\n yield inputs\n\n# Global step\n@step(inputs=[\"instruction\"], step_type=\"global\")\ndef FilteringStep(inputs: StepInput, max_length: RuntimeParameter[int] = 256) -> StepOutput:\n yield [\n input\n for input in inputs\n if len(input[\"instruction\"]) <= max_length\n ]\n\n# Generator step\n@step(outputs=[\"num\"], step_type=\"generator\")\ndef RowGenerator(num_rows: RuntimeParameter[int] = 500) -> GeneratorStepOutput:\n data = list(range(num_rows))\n for i in range(0, len(data), 100):\n last_batch = i + 100 >= len(data)\n yield [{\"num\": num} for num in data[i : i + 100]], last_batch\n Source code in src/distilabel/steps/decorator.py def step(\n inputs: Union[\"StepColumns\", None] = None,\n outputs: Union[\"StepColumns\", None] = None,\n step_type: Literal[\"normal\", \"global\", \"generator\"] = \"normal\",\n) -> Callable[..., Type[\"_Step\"]]:\n \"\"\"Creates an `Step` from a processing function.\n\n Args:\n inputs: a list containing the name of the inputs columns/keys or a dictionary\n where the keys are the columns and the values are booleans indicating whether\n the column is required or not, that are required by the step. If not provided\n the default will be an empty list `[]` and it will be assumed that the step\n doesn't need any specific columns. Defaults to `None`.\n outputs: a list containing the name of the outputs columns/keys or a dictionary\n where the keys are the columns and the values are booleans indicating whether\n the column will be generated or not. If not provided the default will be an\n empty list `[]` and it will be assumed that the step doesn't need any specific\n columns. Defaults to `None`.\n step_type: the kind of step to create. Valid choices are: \"normal\" (`Step`),\n \"global\" (`GlobalStep`) or \"generator\" (`GeneratorStep`). 
Defaults to\n `\"normal\"`.\n\n Returns:\n A callable that will generate the type given the processing function.\n\n Example:\n\n ```python\n # Normal step\n @step(inputs=[\"instruction\"], outputs=[\"generation\"])\n def GenerationStep(inputs: StepInput, dummy_generation: RuntimeParameter[str]) -> StepOutput:\n for input in inputs:\n input[\"generation\"] = dummy_generation\n yield inputs\n\n # Global step\n @step(inputs=[\"instruction\"], step_type=\"global\")\n def FilteringStep(inputs: StepInput, max_length: RuntimeParameter[int] = 256) -> StepOutput:\n yield [\n input\n for input in inputs\n if len(input[\"instruction\"]) <= max_length\n ]\n\n # Generator step\n @step(outputs=[\"num\"], step_type=\"generator\")\n def RowGenerator(num_rows: RuntimeParameter[int] = 500) -> GeneratorStepOutput:\n data = list(range(num_rows))\n for i in range(0, len(data), 100):\n last_batch = i + 100 >= len(data)\n yield [{\"num\": num} for num in data[i : i + 100]], last_batch\n ```\n \"\"\"\n\n inputs = inputs or []\n outputs = outputs or []\n\n def decorator(func: ProcessingFunc) -> Type[\"_Step\"]:\n if step_type not in _STEP_MAPPING:\n raise ValueError(\n f\"Invalid step type '{step_type}'. Please, review the '{func.__name__}'\"\n \" function decorated with the `@step` decorator and provide a valid\"\n \" `step_type`. Valid choices are: 'normal', 'global' or 'generator'.\"\n )\n\n BaseClass = _STEP_MAPPING[step_type]\n\n signature = inspect.signature(func)\n\n runtime_parameters = {\n name: (\n param.annotation,\n param.default if param.default != param.empty else None,\n )\n for name, param in signature.parameters.items()\n }\n\n runtime_parameters = {}\n step_input_parameter = None\n for name, param in signature.parameters.items():\n if is_parameter_annotated_with(param, _RUNTIME_PARAMETER_ANNOTATION):\n runtime_parameters[name] = (\n param.annotation,\n param.default if param.default != param.empty else None,\n )\n\n if not step_type == \"generator\" and is_parameter_annotated_with(\n param, _STEP_INPUT_ANNOTATION\n ):\n if step_input_parameter is not None:\n raise ValueError(\n f\"Function '{func.__name__}' has more than one parameter annotated\"\n f\" with `StepInput`. Please, review the '{func.__name__}' function\"\n \" decorated with the `@step` decorator and provide only one\"\n \" argument annotated with `StepInput`.\"\n )\n step_input_parameter = param\n\n RuntimeParametersModel = create_model( # type: ignore\n \"RuntimeParametersModel\",\n **runtime_parameters, # type: ignore\n )\n\n def inputs_property(self) -> \"StepColumns\":\n return inputs\n\n def outputs_property(self) -> \"StepColumns\":\n return outputs\n\n def process(\n self, *args: Any, **kwargs: Any\n ) -> Union[\"StepOutput\", \"GeneratorStepOutput\"]:\n return func(*args, **kwargs)\n\n return type( # type: ignore\n func.__name__,\n (\n BaseClass,\n RuntimeParametersModel,\n ),\n {\n \"process\": process,\n \"inputs\": property(inputs_property),\n \"outputs\": property(outputs_property),\n \"__module__\": func.__module__,\n \"__doc__\": func.__doc__,\n \"_built_from_decorator\": True,\n # Override the `get_process_step_input` method to return the parameter\n # of the original function annotated with `StepInput`.\n \"get_process_step_input\": lambda self: step_input_parameter,\n },\n )\n\n return decorator\n "},{"location":"api/step/generator_step/","title":"GeneratorStep","text":"This section contains the API reference for the GeneratorStep class. 
For more information and examples on how to use existing generator steps or create custom ones, please refer to Tutorial - Step - GeneratorStep. "},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep","title":"GeneratorStep ","text":" Bases: _Step , ABC A special kind of Step that is able to generate data i.e. it doesn't receive any input from the previous steps. Attributes: Name Type Description batch_size RuntimeParameter[int] The number of rows that will contain the batches generated by the step. Defaults to 50 . Runtime parameters batch_size : The number of rows that will contain the batches generated by the step. Defaults to 50 . Source code in src/distilabel/steps/base.py class GeneratorStep(_Step, ABC):\n \"\"\"A special kind of `Step` that is able to generate data i.e. it doesn't receive\n any input from the previous steps.\n\n Attributes:\n batch_size: The number of rows that will contain the batches generated by the\n step. Defaults to `50`.\n\n Runtime parameters:\n - `batch_size`: The number of rows that will contain the batches generated by\n the step. Defaults to `50`.\n \"\"\"\n\n batch_size: RuntimeParameter[int] = Field(\n default=50,\n description=\"The number of rows that will contain the batches generated by the\"\n \" step.\",\n )\n\n @abstractmethod\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Method that defines the generation logic of the step. It should yield the\n output rows and a boolean indicating if it's the last batch or not.\n\n Args:\n offset: The offset to start the generation from. Defaults to 0.\n\n Yields:\n The output rows and a boolean indicating if it's the last batch or not.\n \"\"\"\n pass\n\n def process_applying_mappings(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Runs the `process` method of the step applying the `outputs_mappings` to the\n output rows. This is the function that should be used to run the generation logic\n of the step.\n\n Args:\n offset: The offset to start the generation from. Defaults to 0.\n\n Yields:\n The output rows and a boolean indicating if it's the last batch or not.\n \"\"\"\n\n # If the `Step` was built using the `@step` decorator, then we need to pass\n # the runtime parameters as `kwargs`, so they can be used within the processing\n # function\n generator = (\n self.process(offset=offset)\n if not self._built_from_decorator\n else self.process(offset=offset, **self._runtime_parameters)\n )\n\n for output_rows, last_batch in generator:\n yield (\n [\n {self.output_mappings.get(k, k): v for k, v in row.items()}\n for row in output_rows\n ],\n last_batch,\n )\n "},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep.process","title":"process(offset=0) abstractmethod ","text":"Method that defines the generation logic of the step. It should yield the output rows and a boolean indicating if it's the last batch or not. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0. 0 Yields: Type Description GeneratorStepOutput The output rows and a boolean indicating if it's the last batch or not. Source code in src/distilabel/steps/base.py @abstractmethod\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Method that defines the generation logic of the step. It should yield the\n output rows and a boolean indicating if it's the last batch or not.\n\n Args:\n offset: The offset to start the generation from. 
Defaults to 0.\n\n Yields:\n The output rows and a boolean indicating if it's the last batch or not.\n \"\"\"\n pass\n "},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep.process_applying_mappings","title":"process_applying_mappings(offset=0) ","text":"Runs the process method of the step applying the outputs_mappings to the output rows. This is the function that should be used to run the generation logic of the step. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0. 0 Yields: Type Description GeneratorStepOutput The output rows and a boolean indicating if it's the last batch or not. Source code in src/distilabel/steps/base.py def process_applying_mappings(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Runs the `process` method of the step applying the `outputs_mappings` to the\n output rows. This is the function that should be used to run the generation logic\n of the step.\n\n Args:\n offset: The offset to start the generation from. Defaults to 0.\n\n Yields:\n The output rows and a boolean indicating if it's the last batch or not.\n \"\"\"\n\n # If the `Step` was built using the `@step` decorator, then we need to pass\n # the runtime parameters as `kwargs`, so they can be used within the processing\n # function\n generator = (\n self.process(offset=offset)\n if not self._built_from_decorator\n else self.process(offset=offset, **self._runtime_parameters)\n )\n\n for output_rows, last_batch in generator:\n yield (\n [\n {self.output_mappings.get(k, k): v for k, v in row.items()}\n for row in output_rows\n ],\n last_batch,\n )\n "},{"location":"api/step/generator_step/#distilabel.steps.generators.utils.make_generator_step","title":"make_generator_step(dataset, pipeline=None, batch_size=50, input_mappings=None, output_mappings=None, resources=StepResources(), repo_id='default_name') ","text":"Helper method to create a GeneratorStep from a dataset, to simplify Parameters: Name Type Description Default dataset Union[Dataset, DataFrame, List[Dict[str, str]]] The dataset to use in the Pipeline . required batch_size int The batch_size, will default to the same used by the GeneratorStep s. Defaults to 50 . 50 input_mappings Optional[Dict[str, str]] Applies the same as any other step. Defaults to None . None output_mappings Optional[Dict[str, str]] Applies the same as any other step. Defaults to None . None resources StepResources Applies the same as any other step. Defaults to StepResources() . StepResources() repo_id Optional[str] The repository ID to use in the LoadDataFromHub step. This shouldn't be necessary, but in case of error, the dataset will try to be loaded using load_dataset internally. If that case happens, the repo_id will be used. 'default_name' Raises: Type Description ValueError If the format is different from the ones supported. Returns: Type Description GeneratorStep A LoadDataFromDicts if the input is a list of dicts, or LoadDataFromHub instance GeneratorStep if the input is a pd.DataFrame or a Dataset . 
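A short, hedged sketch of calling `make_generator_step` directly: the pipeline name and the example rows are arbitrary, and the only API assumed is the signature documented above plus the standard `Pipeline` import.

```python
from distilabel.pipeline import Pipeline
from distilabel.steps.generators.utils import make_generator_step

# A couple of in-memory rows; their content is purely illustrative.
data = [
    {"instruction": "Tell me a joke."},
    {"instruction": "Summarize the plot of Hamlet."},
]

with Pipeline(name="demo-pipeline") as pipeline:
    # A list of dicts yields a `LoadDataFromDicts` instance; a `pd.DataFrame`
    # or `datasets.Dataset` would yield a pre-populated `LoadDataFromHub` instead.
    loader = make_generator_step(data, pipeline=pipeline, batch_size=2)
    # `loader` behaves like any other `GeneratorStep`, e.g. `loader >> next_step`.
```

This is mainly a convenience for feeding in-memory data into a `Pipeline` without hand-writing a loader step.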
Source code in src/distilabel/steps/generators/utils.py def make_generator_step(\n dataset: Union[Dataset, pd.DataFrame, List[Dict[str, str]]],\n pipeline: Union[\"BasePipeline\", None] = None,\n batch_size: int = 50,\n input_mappings: Optional[Dict[str, str]] = None,\n output_mappings: Optional[Dict[str, str]] = None,\n resources: StepResources = StepResources(),\n repo_id: Optional[str] = \"default_name\",\n) -> \"GeneratorStep\":\n \"\"\"Helper method to create a `GeneratorStep` from a dataset, to simplify\n\n Args:\n dataset: The dataset to use in the `Pipeline`.\n batch_size: The batch_size, will default to the same used by the `GeneratorStep`s.\n Defaults to `50`.\n input_mappings: Applies the same as any other step. Defaults to `None`.\n output_mappings: Applies the same as any other step. Defaults to `None`.\n resources: Applies the same as any other step. Defaults to `StepResources()`.\n repo_id: The repository ID to use in the `LoadDataFromHub` step.\n This shouldn't be necessary, but in case of error, the dataset will try to be loaded\n using `load_dataset` internally. If that case happens, the `repo_id` will be used.\n\n Raises:\n ValueError: If the format is different from the ones supported.\n\n Returns:\n A `LoadDataFromDicts` if the input is a list of dicts, or `LoadDataFromHub` instance\n if the input is a `pd.DataFrame` or a `Dataset`.\n \"\"\"\n from distilabel.steps import LoadDataFromDicts, LoadDataFromHub\n\n if isinstance(dataset, list):\n return LoadDataFromDicts(\n pipeline=pipeline,\n data=dataset,\n batch_size=batch_size,\n input_mappings=input_mappings or {},\n output_mappings=output_mappings or {},\n resources=resources,\n )\n\n if isinstance(dataset, pd.DataFrame):\n dataset = Dataset.from_pandas(dataset, preserve_index=False)\n\n if not isinstance(dataset, Dataset):\n raise DistilabelUserError(\n f\"Dataset type not allowed: {type(dataset)}, must be one of: \"\n \"`datasets.Dataset`, `pd.DataFrame`, `List[Dict[str, str]]`\",\n page=\"sections/how_to_guides/basic/pipeline/?h=make_#__tabbed_1_2\",\n )\n\n loader = LoadDataFromHub(\n pipeline=pipeline,\n repo_id=repo_id,\n batch_size=batch_size,\n input_mappings=input_mappings or {},\n output_mappings=output_mappings or {},\n resources=resources,\n )\n super(loader.__class__, loader).load() # Ensure the logger is loaded\n loader._dataset = dataset\n loader.num_examples = len(dataset)\n loader._dataset_info = {\"default\": dataset.info}\n return loader\n "},{"location":"api/step/global_step/","title":"GlobalStep","text":"This section contains the API reference for the GlobalStep class. For more information and examples on how to use existing global steps or create custom ones, please refer to Tutorial - Step - GlobalStep. "},{"location":"api/step/global_step/#distilabel.steps.base.GlobalStep","title":"GlobalStep ","text":" Bases: Step , ABC A special kind of Step which it's process method receives all the data processed by their previous steps at once, instead of receiving it in batches. This kind of steps are useful when the processing logic requires to have all the data at once, for example to train a model, to perform a global aggregation, etc. Source code in src/distilabel/steps/base.py class GlobalStep(Step, ABC):\n \"\"\"A special kind of `Step` which it's `process` method receives all the data processed\n by their previous steps at once, instead of receiving it in batches. 
This kind of steps\n are useful when the processing logic requires to have all the data at once, for example\n to train a model, to perform a global aggregation, etc.\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n return []\n\n @property\n def outputs(self) -> \"StepColumns\":\n return []\n "},{"location":"api/step/resources/","title":"StepResources","text":""},{"location":"api/step/resources/#distilabel.steps.base.StepResources","title":"StepResources ","text":" Bases: RuntimeParametersMixin , BaseModel A class to define the resources assigned to a _Step . Attributes: Name Type Description replicas RuntimeParameter[PositiveInt] The number of replicas for the step. cpus Optional[RuntimeParameter[PositiveInt]] The number of CPUs assigned to each step replica. gpus Optional[RuntimeParameter[PositiveInt]] The number of GPUs assigned to each step replica. memory Optional[RuntimeParameter[PositiveInt]] The memory in bytes required for each step replica. resources Optional[RuntimeParameter[Dict[str, int]]] A dictionary containing the number of custom resources required for each step replica. Source code in src/distilabel/steps/base.py class StepResources(RuntimeParametersMixin, BaseModel):\n \"\"\"A class to define the resources assigned to a `_Step`.\n\n Attributes:\n replicas: The number of replicas for the step.\n cpus: The number of CPUs assigned to each step replica.\n gpus: The number of GPUs assigned to each step replica.\n memory: The memory in bytes required for each step replica.\n resources: A dictionary containing the number of custom resources required for\n each step replica.\n \"\"\"\n\n replicas: RuntimeParameter[PositiveInt] = Field(\n default=1, description=\"The number of replicas for the step.\"\n )\n cpus: Optional[RuntimeParameter[PositiveInt]] = Field(\n default=None, description=\"The number of CPUs assigned to each step replica.\"\n )\n gpus: Optional[RuntimeParameter[PositiveInt]] = Field(\n default=None, description=\"The number of GPUs assigned to each step replica.\"\n )\n memory: Optional[RuntimeParameter[PositiveInt]] = Field(\n default=None, description=\"The memory in bytes required for each step replica.\"\n )\n resources: Optional[RuntimeParameter[Dict[str, int]]] = Field(\n default=None,\n description=\"A dictionary containing names of custom resources and the\"\n \" number of those resources required for each step replica.\",\n )\n "},{"location":"api/step/typing/","title":"Step Typing","text":""},{"location":"api/step/typing/#distilabel.steps.typing","title":"typing ","text":""},{"location":"api/step/typing/#distilabel.steps.typing.StepOutput","title":"StepOutput = Iterator[List[Dict[str, Any]]] module-attribute ","text":"StepOutput is an alias of the typing Iterator[List[Dict[str, Any]]] "},{"location":"api/step/typing/#distilabel.steps.typing.GeneratorStepOutput","title":"GeneratorStepOutput = Iterator[Tuple[List[Dict[str, Any]], bool]] module-attribute ","text":"GeneratorStepOutput is an alias of the typing Iterator[Tuple[List[Dict[str, Any]], bool]] "},{"location":"api/step/typing/#distilabel.steps.typing.StepColumns","title":"StepColumns = Union[List[str], Dict[str, bool]] module-attribute ","text":"StepColumns is an alias of the typing Union[List[str], Dict[str, bool]] used by the inputs and outputs properties of an Step . In the case of a List[str] , it is a list with the required columns. 
In the case of a Dict[str, bool] , it is a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not. "},{"location":"api/step_gallery/argilla/","title":"Argilla","text":"This section contains the existing steps integrated with Argilla so as to easily push the generated datasets to Argilla. "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base","title":"base ","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase","title":"ArgillaBase ","text":" Bases: Step , ABC Abstract step that provides a class to subclass from, that contains the boilerplate code required to interact with Argilla, as well as some extra validations on top of it. It also defines the abstract methods that need to be implemented in order to add a new dataset type as a step. Note This class is not intended to be instanced directly, but via subclass. Attributes: Name Type Description dataset_name RuntimeParameter[str] The name of the dataset in Argilla where the records will be added. dataset_workspace Optional[RuntimeParameter[str]] The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. api_url Optional[RuntimeParameter[str]] The URL of the Argilla API. Defaults to None , which means it will be read from the ARGILLA_API_URL environment variable. api_key Optional[RuntimeParameter[SecretStr]] The API key to authenticate with Argilla. Defaults to None , which means it will be read from the ARGILLA_API_KEY environment variable. Runtime parameters dataset_name : The name of the dataset in Argilla where the records will be added. dataset_workspace : The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. api_url : The base URL to use for the Argilla API requests. api_key : The API key to authenticate the requests to the Argilla API. Input columns - dynamic, based on the
inputs value provided Source code in src/distilabel/steps/argilla/base.py class ArgillaBase(Step, ABC):\n \"\"\"Abstract step that provides a class to subclass from, that contains the boilerplate code\n required to interact with Argilla, as well as some extra validations on top of it. It also defines\n the abstract methods that need to be implemented in order to add a new dataset type as a step.\n\n Note:\n This class is not intended to be instanced directly, but via subclass.\n\n Attributes:\n dataset_name: The name of the dataset in Argilla where the records will be added.\n dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n `None`, which means it will be created in the default workspace.\n api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n the `ARGILLA_API_URL` environment variable.\n api_key: The API key to authenticate with Argilla. Defaults to `None`, which means it will\n be read from the `ARGILLA_API_KEY` environment variable.\n\n Runtime parameters:\n - `dataset_name`: The name of the dataset in Argilla where the records will be\n added.\n - `dataset_workspace`: The workspace where the dataset will be created in Argilla.\n Defaults to `None`, which means it will be created in the default workspace.\n - `api_url`: The base URL to use for the Argilla API requests.\n - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n Input columns:\n - dynamic, based on the `inputs` value provided\n \"\"\"\n\n dataset_name: RuntimeParameter[str] = Field(\n default=None, description=\"The name of the dataset in Argilla.\"\n )\n dataset_workspace: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The workspace where the dataset will be created in Argilla. Defaults \"\n \"to `None` which means it will be created in the default workspace.\",\n )\n\n api_url: Optional[RuntimeParameter[str]] = Field(\n default_factory=lambda: os.getenv(_ARGILLA_API_URL_ENV_VAR_NAME),\n description=\"The base URL to use for the Argilla API requests.\",\n )\n api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n default_factory=lambda: os.getenv(_ARGILLA_API_KEY_ENV_VAR_NAME),\n description=\"The API key to authenticate the requests to the Argilla API.\",\n )\n\n _client: Optional[\"Argilla\"] = PrivateAttr(...)\n _dataset: Optional[\"Dataset\"] = PrivateAttr(...)\n\n def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings.\"\"\"\n super().model_post_init(__context)\n\n if importlib.util.find_spec(\"argilla\") is None:\n raise ImportError(\n \"Argilla is not installed. 
Please install it using `pip install argilla\"\n \" --upgrade`.\"\n )\n\n def _client_init(self) -> None:\n \"\"\"Initializes the Argilla API client with the provided `api_url` and `api_key`.\"\"\"\n try:\n self._client = rg.Argilla( # type: ignore\n api_url=self.api_url,\n api_key=self.api_key.get_secret_value(), # type: ignore\n headers={\"Authorization\": f\"Bearer {os.environ['HF_TOKEN']}\"}\n if isinstance(self.api_url, str)\n and \"hf.space\" in self.api_url\n and \"HF_TOKEN\" in os.environ\n else {},\n )\n except Exception as e:\n raise DistilabelUserError(\n f\"Failed to initialize the Argilla API: {e}\",\n page=\"sections/how_to_guides/advanced/argilla/\",\n ) from e\n\n @property\n def _dataset_exists_in_workspace(self) -> bool:\n \"\"\"Checks if the dataset already exists in Argilla in the provided workspace if any.\n\n Returns:\n `True` if the dataset exists, `False` otherwise.\n \"\"\"\n return (\n self._client.datasets( # type: ignore\n name=self.dataset_name, # type: ignore\n workspace=self.dataset_workspace,\n )\n is not None\n )\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs of the step is an empty list, since the steps subclassing from this one, will\n always be leaf nodes and won't propagate the inputs neither generate any outputs.\n \"\"\"\n return []\n\n def load(self) -> None:\n \"\"\"Method to perform any initialization logic before the `process` method is\n called. For example, to load an LLM, stablish a connection to a database, etc.\n \"\"\"\n super().load()\n\n if self.api_url is None or self.api_key is None:\n raise DistilabelUserError(\n \"`Argilla` step requires the `api_url` and `api_key` to be provided. Please,\"\n \" provide those at step instantiation, via environment variables `ARGILLA_API_URL`\"\n \" and `ARGILLA_API_KEY`, or as `Step` runtime parameters via `pipeline.run(parameters={...})`.\",\n page=\"sections/how_to_guides/advanced/argilla/\",\n )\n\n self._client_init()\n\n @property\n @abstractmethod\n def inputs(self) -> \"StepColumns\": ...\n\n @abstractmethod\n def process(self, *inputs: StepInput) -> \"StepOutput\": ...\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.outputs","title":"outputs: StepColumns property ","text":"The outputs of the step is an empty list, since the steps subclassing from this one, will always be leaf nodes and won't propagate the inputs neither generate any outputs. "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.model_post_init","title":"model_post_init(__context) ","text":"Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings. Source code in src/distilabel/steps/argilla/base.py def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings.\"\"\"\n super().model_post_init(__context)\n\n if importlib.util.find_spec(\"argilla\") is None:\n raise ImportError(\n \"Argilla is not installed. Please install it using `pip install argilla\"\n \" --upgrade`.\"\n )\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.load","title":"load() ","text":"Method to perform any initialization logic before the process method is called. For example, to load an LLM, stablish a connection to a database, etc. Source code in src/distilabel/steps/argilla/base.py def load(self) -> None:\n \"\"\"Method to perform any initialization logic before the `process` method is\n called. 
For example, to load an LLM, stablish a connection to a database, etc.\n \"\"\"\n super().load()\n\n if self.api_url is None or self.api_key is None:\n raise DistilabelUserError(\n \"`Argilla` step requires the `api_url` and `api_key` to be provided. Please,\"\n \" provide those at step instantiation, via environment variables `ARGILLA_API_URL`\"\n \" and `ARGILLA_API_KEY`, or as `Step` runtime parameters via `pipeline.run(parameters={...})`.\",\n page=\"sections/how_to_guides/advanced/argilla/\",\n )\n\n self._client_init()\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference","title":"preference ","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla","title":"PreferenceToArgilla ","text":" Bases: ArgillaBase Creates a preference dataset in Argilla. Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a preference dataset, where there's one field for the instruction and one extra field per each generation within the same record, and then a rating question per each of the generation fields. The rating question asks the annotator to set a rating from 1 to 5 for each of the provided generations. Note This step is meant to be used in conjunction with the UltraFeedback step, or any other step generating both ratings and responses for a given set of instruction and generations for the given instruction. But alternatively, it can also be used with any other task or step generating only the instruction and generations , as the ratings and rationales are optional. Attributes: Name Type Description num_generations int The number of generations to include in the dataset. dataset_name int The name of the dataset in Argilla. dataset_workspace int The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. api_url int The URL of the Argilla API. Defaults to None , which means it will be read from the ARGILLA_API_URL environment variable. api_key int The API key to authenticate with Argilla. Defaults to None , which means it will be read from the ARGILLA_API_KEY environment variable. Runtime parameters api_url : The base URL to use for the Argilla API requests. api_key : The API key to authenticate the requests to the Argilla API. Input columns - instruction (
str ): The instruction that was used to generate the completion. - generations (
List[str] ): The completions that were generated based on the input instruction. - ratings ( 
List[str] , optional): The ratings for the generations. If not provided, the generated ratings won't be pushed to Argilla. - rationales (
List[str] , optional): The rationales for the ratings. If not provided, the generated rationales won't be pushed to Argilla. Examples: Push a preference dataset to an Argilla instance: from distilabel.steps import PreferenceToArgilla\n\nto_argilla = PreferenceToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n It can also include ratings and rationales: result = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n \"ratings\": [\"4\", \"5\"],\n \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n }\n ],\n )\n)\n# >>> result\n# [\n# {\n# 'instruction': 'instruction',\n# 'generations': ['first_generation', 'second_generation'],\n# 'ratings': ['4', '5'],\n# 'rationales': ['rationale for 4', 'rationale for 5']\n# }\n# ]\n Source code in src/distilabel/steps/argilla/preference.py class PreferenceToArgilla(ArgillaBase):\n \"\"\"Creates a preference dataset in Argilla.\n\n Step that creates a dataset in Argilla during the load phase, and then pushes the input\n batches into it as records. This dataset is a preference dataset, where there's one field\n for the instruction and one extra field per each generation within the same record, and then\n a rating question per each of the generation fields. The rating question asks the annotator to\n set a rating from 1 to 5 for each of the provided generations.\n\n Note:\n This step is meant to be used in conjunction with the `UltraFeedback` step, or any other step\n generating both ratings and responses for a given set of instruction and generations for the\n given instruction. But alternatively, it can also be used with any other task or step generating\n only the `instruction` and `generations`, as the `ratings` and `rationales` are optional.\n\n Attributes:\n num_generations: The number of generations to include in the dataset.\n dataset_name: The name of the dataset in Argilla.\n dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n `None`, which means it will be created in the default workspace.\n api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n the `ARGILLA_API_URL` environment variable.\n api_key: The API key to authenticate with Argilla. Defaults to `None`, which means it will\n be read from the `ARGILLA_API_KEY` environment variable.\n\n Runtime parameters:\n - `api_url`: The base URL to use for the Argilla API requests.\n - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n Input columns:\n - instruction (`str`): The instruction that was used to generate the completion.\n - generations (`List[str]`): The completion that was generated based on the input instruction.\n - ratings (`List[str]`, optional): The ratings for the generations. If not provided, the\n generated ratings won't be pushed to Argilla.\n - rationales (`List[str]`, optional): The rationales for the ratings. 
If not provided, the\n generated rationales won't be pushed to Argilla.\n\n Examples:\n Push a preference dataset to an Argilla instance:\n\n ```python\n from distilabel.steps import PreferenceToArgilla\n\n to_argilla = PreferenceToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n )\n to_argilla.load()\n\n result = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n }\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n ```\n\n It can also include ratings and rationales:\n\n ```python\n result = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n \"ratings\": [\"4\", \"5\"],\n \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n }\n ],\n )\n )\n # >>> result\n # [\n # {\n # 'instruction': 'instruction',\n # 'generations': ['first_generation', 'second_generation'],\n # 'ratings': ['4', '5'],\n # 'rationales': ['rationale for 4', 'rationale for 5']\n # }\n # ]\n ```\n \"\"\"\n\n num_generations: int\n\n _id: str = PrivateAttr(default=\"id\")\n _instruction: str = PrivateAttr(...)\n _generations: str = PrivateAttr(...)\n _ratings: str = PrivateAttr(...)\n _rationales: str = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Sets the `_instruction` and `_generations` attributes based on the `inputs_mapping`, otherwise\n uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n the text-generation scenario. And then it pushes it to Argilla.\n \"\"\"\n super().load()\n\n # Both `instruction` and `generations` will be used as the fields of the dataset\n self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n self._generations = self.input_mappings.get(\"generations\", \"generations\")\n # Both `ratings` and `rationales` will be used as suggestions to the default questions of the dataset\n self._ratings = self.input_mappings.get(\"ratings\", \"ratings\")\n self._rationales = self.input_mappings.get(\"rationales\", \"rationales\")\n\n if self._dataset_exists_in_workspace:\n _dataset = self._client.datasets( # type: ignore\n name=self.dataset_name, # type: ignore\n workspace=self.dataset_workspace, # type: ignore\n )\n\n for field in _dataset.fields:\n if not isinstance(field, rg.TextField):\n continue\n if (\n field.name\n not in [self._id, self._instruction] # type: ignore\n + [\n f\"{self._generations}-{idx}\"\n for idx in range(self.num_generations)\n ]\n and field.required\n ):\n raise DistilabelUserError(\n f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n f\" already exists, but contains at least a required field that is\"\n f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generations}`\"\n f\" (one per generation starting from 0 up to {self.num_generations - 1}).\",\n page=\"components-gallery/steps/preferencetoargilla/\",\n )\n\n self._dataset = _dataset\n else:\n _settings = rg.Settings( # type: ignore\n fields=[\n rg.TextField(name=self._id, title=self._id), # type: ignore\n rg.TextField(name=self._instruction, title=self._instruction), # type: ignore\n *self._generation_fields(), # type: ignore\n ],\n questions=self._rating_rationale_pairs(), # type: ignore\n )\n _dataset = rg.Dataset( # type: 
ignore\n name=self.dataset_name,\n workspace=self.dataset_workspace,\n settings=_settings,\n client=self._client,\n )\n self._dataset = _dataset.create()\n\n def _generation_fields(self) -> List[\"TextField\"]:\n \"\"\"Method to generate the fields for each of the generations.\n\n Returns:\n A list containing `TextField`s for each text generation.\n \"\"\"\n return [\n rg.TextField( # type: ignore\n name=f\"{self._generations}-{idx}\",\n title=f\"{self._generations}-{idx}\",\n required=True if idx == 0 else False,\n )\n for idx in range(self.num_generations)\n ]\n\n def _rating_rationale_pairs(\n self,\n ) -> List[Union[\"RatingQuestion\", \"TextQuestion\"]]:\n \"\"\"Method to generate the rating and rationale questions for each of the generations.\n\n Returns:\n A list of questions containing a `RatingQuestion` and `TextQuestion` pair for\n each text generation.\n \"\"\"\n questions = []\n for idx in range(self.num_generations):\n questions.extend(\n [\n rg.RatingQuestion( # type: ignore\n name=f\"{self._generations}-{idx}-rating\",\n title=f\"Rate {self._generations}-{idx} given {self._instruction}.\",\n description=f\"Ignore this question if the corresponding `{self._generations}-{idx}` field is not available.\"\n if idx != 0\n else None,\n values=[1, 2, 3, 4, 5],\n required=True if idx == 0 else False,\n ),\n rg.TextQuestion( # type: ignore\n name=f\"{self._generations}-{idx}-rationale\",\n title=f\"Specify the rationale for {self._generations}-{idx}'s rating.\",\n description=f\"Ignore this question if the corresponding `{self._generations}-{idx}` field is not available.\"\n if idx != 0\n else None,\n required=False,\n ),\n ]\n )\n return questions\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the step are the `instruction` and the `generations`. Optionally, one could also\n provide the `ratings` and the `rationales` for the generations.\"\"\"\n return [\"instruction\", \"generations\"]\n\n @property\n def optional_inputs(self) -> List[str]:\n \"\"\"The optional inputs for the step are the `ratings` and the `rationales` for the generations.\"\"\"\n return [\"ratings\", \"rationales\"]\n\n def _add_suggestions_if_any(self, input: Dict[str, Any]) -> List[\"Suggestion\"]:\n \"\"\"Method to generate the suggestions for the `rg.Record` based on the input.\n\n Returns:\n A list of `Suggestion`s for the rating and rationales questions.\n \"\"\"\n # Since the `suggestions` i.e. 
answers to the `questions` are optional, will default to {}\n suggestions = []\n # If `ratings` is in `input`, then add those as suggestions\n if self._ratings in input:\n suggestions.extend(\n [\n rg.Suggestion( # type: ignore\n value=rating,\n question_name=f\"{self._generations}-{idx}-rating\",\n )\n for idx, rating in enumerate(input[self._ratings])\n if rating is not None\n and isinstance(rating, int)\n and rating in [1, 2, 3, 4, 5]\n ],\n )\n # If `rationales` is in `input`, then add those as suggestions\n if self._rationales in input:\n suggestions.extend(\n [\n rg.Suggestion( # type: ignore\n value=rationale,\n question_name=f\"{self._generations}-{idx}-rationale\",\n )\n for idx, rationale in enumerate(input[self._rationales])\n if rationale is not None and isinstance(rationale, str)\n ],\n )\n return suggestions\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Creates and pushes the records as `rg.Record`s to the Argilla dataset.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n records = []\n for input in inputs:\n # Generate the SHA-256 hash of the instruction to use it as the metadata\n instruction_id = hashlib.sha256(\n input[\"instruction\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n generations = {\n f\"{self._generations}-{idx}\": generation\n for idx, generation in enumerate(input[\"generations\"]) # type: ignore\n }\n\n records.append( # type: ignore\n rg.Record( # type: ignore\n fields={\n \"id\": instruction_id,\n \"instruction\": input[\"instruction\"], # type: ignore\n **generations,\n },\n suggestions=self._add_suggestions_if_any(input), # type: ignore\n )\n )\n self._dataset.records.log(records) # type: ignore\n yield inputs\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.inputs","title":"inputs: List[str] property ","text":"The inputs for the step are the instruction and the generations . Optionally, one could also provide the ratings and the rationales for the generations. "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.optional_inputs","title":"optional_inputs: List[str] property ","text":"The optional inputs for the step are the ratings and the rationales for the generations. "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.load","title":"load() ","text":"Sets the _instruction and _generations attributes based on the inputs_mapping , otherwise uses the default values; and then uses those values to create a FeedbackDataset suited for the text-generation scenario. And then it pushes it to Argilla. Source code in src/distilabel/steps/argilla/preference.py def load(self) -> None:\n \"\"\"Sets the `_instruction` and `_generations` attributes based on the `inputs_mapping`, otherwise\n uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n the text-generation scenario. 
And then it pushes it to Argilla.\n \"\"\"\n super().load()\n\n # Both `instruction` and `generations` will be used as the fields of the dataset\n self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n self._generations = self.input_mappings.get(\"generations\", \"generations\")\n # Both `ratings` and `rationales` will be used as suggestions to the default questions of the dataset\n self._ratings = self.input_mappings.get(\"ratings\", \"ratings\")\n self._rationales = self.input_mappings.get(\"rationales\", \"rationales\")\n\n if self._dataset_exists_in_workspace:\n _dataset = self._client.datasets( # type: ignore\n name=self.dataset_name, # type: ignore\n workspace=self.dataset_workspace, # type: ignore\n )\n\n for field in _dataset.fields:\n if not isinstance(field, rg.TextField):\n continue\n if (\n field.name\n not in [self._id, self._instruction] # type: ignore\n + [\n f\"{self._generations}-{idx}\"\n for idx in range(self.num_generations)\n ]\n and field.required\n ):\n raise DistilabelUserError(\n f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n f\" already exists, but contains at least a required field that is\"\n f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generations}`\"\n f\" (one per generation starting from 0 up to {self.num_generations - 1}).\",\n page=\"components-gallery/steps/preferencetoargilla/\",\n )\n\n self._dataset = _dataset\n else:\n _settings = rg.Settings( # type: ignore\n fields=[\n rg.TextField(name=self._id, title=self._id), # type: ignore\n rg.TextField(name=self._instruction, title=self._instruction), # type: ignore\n *self._generation_fields(), # type: ignore\n ],\n questions=self._rating_rationale_pairs(), # type: ignore\n )\n _dataset = rg.Dataset( # type: ignore\n name=self.dataset_name,\n workspace=self.dataset_workspace,\n settings=_settings,\n client=self._client,\n )\n self._dataset = _dataset.create()\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.process","title":"process(inputs) ","text":"Creates and pushes the records as rg.Record s to the Argilla dataset. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Returns: Type Description StepOutput A list of Python dictionaries with the outputs of the task. 
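Note: as shown in the source below, _add_suggestions_if_any only forwards ratings that are integers between 1 and 5, so ratings passed as strings (e.g. \"4\") are still logged as records but skipped as rating suggestions. A minimal, hypothetical sketch (assuming the ratings arrive as numeric strings) that casts them before calling process : rows = [\n    {\n        \"instruction\": \"instruction\",\n        \"generations\": [\"first_generation\", \"second_generation\"],\n        \"ratings\": [\"4\", \"5\"],  # numeric strings (assumption)\n        \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n    }\n]\n# Cast the ratings to integers so they are kept as Argilla rating suggestions\nfor row in rows:\n    row[\"ratings\"] = [int(rating) for rating in row[\"ratings\"]]\n\nresult = next(to_argilla.process(rows))\n 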
Source code in src/distilabel/steps/argilla/preference.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Creates and pushes the records as `rg.Record`s to the Argilla dataset.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n records = []\n for input in inputs:\n # Generate the SHA-256 hash of the instruction to use it as the metadata\n instruction_id = hashlib.sha256(\n input[\"instruction\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n generations = {\n f\"{self._generations}-{idx}\": generation\n for idx, generation in enumerate(input[\"generations\"]) # type: ignore\n }\n\n records.append( # type: ignore\n rg.Record( # type: ignore\n fields={\n \"id\": instruction_id,\n \"instruction\": input[\"instruction\"], # type: ignore\n **generations,\n },\n suggestions=self._add_suggestions_if_any(input), # type: ignore\n )\n )\n self._dataset.records.log(records) # type: ignore\n yield inputs\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation","title":"text_generation ","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla","title":"TextGenerationToArgilla ","text":" Bases: ArgillaBase Creates a text generation dataset in Argilla. Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a text-generation dataset, where there's one field per each input, and then a label question to rate the quality of the completion in either bad (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d). Note This step is meant to be used in conjunction with a TextGeneration step and no column mapping is needed, as it will use the default values for the instruction and generation columns. Attributes: Name Type Description dataset_name The name of the dataset in Argilla. dataset_workspace The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. api_url The URL of the Argilla API. Defaults to None , which means it will be read from the ARGILLA_API_URL environment variable. api_key The API key to authenticate with Argilla. Defaults to None , which means it will be read from the ARGILLA_API_KEY environment variable. Runtime parameters api_url : The base URL to use for the Argilla API requests. api_key : The API key to authenticate the requests to the Argilla API. Input columns - instruction (
str ): The instruction that was used to generate the completion. - generation (
str or List[str] ): The completions that were generated based on the input instruction. Examples: Push a text generation dataset to an Argilla instance: from distilabel.steps import PreferenceToArgilla\n\nto_argilla = TextGenerationToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generation\": \"generation\",\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generation': 'generation'}]\n Source code in src/distilabel/steps/argilla/text_generation.py class TextGenerationToArgilla(ArgillaBase):\n \"\"\"Creates a text generation dataset in Argilla.\n\n `Step` that creates a dataset in Argilla during the load phase, and then pushes the input\n batches into it as records. This dataset is a text-generation dataset, where there's one field\n per each input, and then a label question to rate the quality of the completion in either bad\n (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d).\n\n Note:\n This step is meant to be used in conjunction with a `TextGeneration` step and no column mapping\n is needed, as it will use the default values for the `instruction` and `generation` columns.\n\n Attributes:\n dataset_name: The name of the dataset in Argilla.\n dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n `None`, which means it will be created in the default workspace.\n api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n the `ARGILLA_API_URL` environment variable.\n api_key: The API key to authenticate with Argilla. Defaults to `None`, which means it will\n be read from the `ARGILLA_API_KEY` environment variable.\n\n Runtime parameters:\n - `api_url`: The base URL to use for the Argilla API requests.\n - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n Input columns:\n - instruction (`str`): The instruction that was used to generate the completion.\n - generation (`str` or `List[str]`): The completions that were generated based on the input instruction.\n\n Examples:\n Push a text generation dataset to an Argilla instance:\n\n ```python\n from distilabel.steps import PreferenceToArgilla\n\n to_argilla = TextGenerationToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n )\n to_argilla.load()\n\n result = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generation\": \"generation\",\n }\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction', 'generation': 'generation'}]\n ```\n \"\"\"\n\n _id: str = PrivateAttr(default=\"id\")\n _instruction: str = PrivateAttr(...)\n _generation: str = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Sets the `_instruction` and `_generation` attributes based on the `inputs_mapping`, otherwise\n uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n the text-generation scenario. 
And then it pushes it to Argilla.\n \"\"\"\n super().load()\n\n self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n self._generation = self.input_mappings.get(\"generation\", \"generation\")\n\n if self._dataset_exists_in_workspace:\n _dataset = self._client.datasets( # type: ignore\n name=self.dataset_name, # type: ignore\n workspace=self.dataset_workspace, # type: ignore\n )\n\n for field in _dataset.fields:\n if not isinstance(field, rg.TextField): # type: ignore\n continue\n if (\n field.name not in [self._id, self._instruction, self._generation]\n and field.required\n ):\n raise DistilabelUserError(\n f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n f\" already exists, but contains at least a required field that is\"\n f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generation}`,\"\n \" so it cannot be reused for this dataset.\",\n page=\"components-gallery/steps/textgenerationtoargilla/\",\n )\n\n self._dataset = _dataset\n else:\n _settings = rg.Settings( # type: ignore\n fields=[\n rg.TextField(name=self._id, title=self._id), # type: ignore\n rg.TextField(name=self._instruction, title=self._instruction), # type: ignore\n rg.TextField(name=self._generation, title=self._generation), # type: ignore\n ],\n questions=[\n rg.LabelQuestion( # type: ignore\n name=\"quality\",\n title=f\"What's the quality of the {self._generation} for the given {self._instruction}?\",\n labels={\"bad\": \"\ud83d\udc4e\", \"good\": \"\ud83d\udc4d\"}, # type: ignore\n )\n ],\n )\n _dataset = rg.Dataset( # type: ignore\n name=self.dataset_name,\n workspace=self.dataset_workspace,\n settings=_settings,\n client=self._client,\n )\n self._dataset = _dataset.create()\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the step are the `instruction` and the `generation`.\"\"\"\n return [\"instruction\", \"generation\"]\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Creates and pushes the records as FeedbackRecords to the Argilla dataset.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n records = []\n for input in inputs:\n # Generate the SHA-256 hash of the instruction to use it as the metadata\n instruction_id = hashlib.sha256(\n input[\"instruction\"].encode(\"utf-8\")\n ).hexdigest()\n\n generations = input[\"generation\"]\n\n # If the `generation` is not a list, then convert it into a list\n if not isinstance(generations, list):\n generations = [generations]\n\n # Create a `generations_set` to avoid adding duplicates\n generations_set = set()\n\n for generation in generations:\n # If the generation is already in the set, then skip it\n if generation in generations_set:\n continue\n # Otherwise, add it to the set\n generations_set.add(generation)\n\n records.append(\n rg.Record( # type: ignore\n fields={\n self._id: instruction_id,\n self._instruction: input[\"instruction\"],\n self._generation: generation,\n },\n ),\n )\n self._dataset.records.log(records) # type: ignore\n yield inputs\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.inputs","title":"inputs: List[str] property ","text":"The inputs for the step are the instruction and the generation . 
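 Because the process method shown in the source above turns a non-list generation into a list and skips duplicated generations, a single input row can yield several Argilla records. A minimal sketch (reusing the to_argilla instance from the example above) pushing several generations for one instruction: result = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generation\": [\"generation 1\", \"generation 2\", \"generation 1\"],\n            }\n        ],\n    )\n)\n# Two records are logged to Argilla (the duplicated \"generation 1\" is skipped),\n# while the original input batch is yielded unchanged.\n 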
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.load","title":"load() ","text":"Sets the _instruction and _generation attributes based on the inputs_mapping , otherwise uses the default values; and then uses those values to create a FeedbackDataset suited for the text-generation scenario. And then it pushes it to Argilla. Source code in src/distilabel/steps/argilla/text_generation.py def load(self) -> None:\n \"\"\"Sets the `_instruction` and `_generation` attributes based on the `inputs_mapping`, otherwise\n uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n the text-generation scenario. And then it pushes it to Argilla.\n \"\"\"\n super().load()\n\n self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n self._generation = self.input_mappings.get(\"generation\", \"generation\")\n\n if self._dataset_exists_in_workspace:\n _dataset = self._client.datasets( # type: ignore\n name=self.dataset_name, # type: ignore\n workspace=self.dataset_workspace, # type: ignore\n )\n\n for field in _dataset.fields:\n if not isinstance(field, rg.TextField): # type: ignore\n continue\n if (\n field.name not in [self._id, self._instruction, self._generation]\n and field.required\n ):\n raise DistilabelUserError(\n f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n f\" already exists, but contains at least a required field that is\"\n f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generation}`,\"\n \" so it cannot be reused for this dataset.\",\n page=\"components-gallery/steps/textgenerationtoargilla/\",\n )\n\n self._dataset = _dataset\n else:\n _settings = rg.Settings( # type: ignore\n fields=[\n rg.TextField(name=self._id, title=self._id), # type: ignore\n rg.TextField(name=self._instruction, title=self._instruction), # type: ignore\n rg.TextField(name=self._generation, title=self._generation), # type: ignore\n ],\n questions=[\n rg.LabelQuestion( # type: ignore\n name=\"quality\",\n title=f\"What's the quality of the {self._generation} for the given {self._instruction}?\",\n labels={\"bad\": \"\ud83d\udc4e\", \"good\": \"\ud83d\udc4d\"}, # type: ignore\n )\n ],\n )\n _dataset = rg.Dataset( # type: ignore\n name=self.dataset_name,\n workspace=self.dataset_workspace,\n settings=_settings,\n client=self._client,\n )\n self._dataset = _dataset.create()\n "},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.process","title":"process(inputs) ","text":"Creates and pushes the records as FeedbackRecords to the Argilla dataset. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Returns: Type Description StepOutput A list of Python dictionaries with the outputs of the task. 
Source code in src/distilabel/steps/argilla/text_generation.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Creates and pushes the records as FeedbackRecords to the Argilla dataset.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n records = []\n for input in inputs:\n # Generate the SHA-256 hash of the instruction to use it as the metadata\n instruction_id = hashlib.sha256(\n input[\"instruction\"].encode(\"utf-8\")\n ).hexdigest()\n\n generations = input[\"generation\"]\n\n # If the `generation` is not a list, then convert it into a list\n if not isinstance(generations, list):\n generations = [generations]\n\n # Create a `generations_set` to avoid adding duplicates\n generations_set = set()\n\n for generation in generations:\n # If the generation is already in the set, then skip it\n if generation in generations_set:\n continue\n # Otherwise, add it to the set\n generations_set.add(generation)\n\n records.append(\n rg.Record( # type: ignore\n fields={\n self._id: instruction_id,\n self._instruction: input[\"instruction\"],\n self._generation: generation,\n },\n ),\n )\n self._dataset.records.log(records) # type: ignore\n yield inputs\n "},{"location":"api/step_gallery/columns/","title":"Columns","text":"This section contains the existing steps intended to be used for common column operations to apply to the batches. "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand","title":"expand ","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns","title":"ExpandColumns ","text":" Bases: Step Expand columns that contain lists into multiple rows. ExpandColumns is a Step that takes a list of columns and expands them into multiple rows. The new rows will have the same data as the original row, except for the expanded column, which will contain a single item from the original list. Attributes: Name Type Description columns Union[Dict[str, str], List[str]] A dictionary that maps the column to be expanded to the new column name or a list of columns to be expanded. If a list is provided, the new column name will be the same as the column name. encoded Union[bool, List[str]] A bool to inform Whether the columns are JSON encoded lists. If this value is set to True, the columns will be decoded before expanding. Alternatively, to specify columns that can be encoded, a list can be provided. In this case, the column names informed must be a subset of the columns selected for expansion. split_statistics bool A bool to inform whether the statistics in the distilabel_metadata column should be split into multiple rows. If we want to expand some columns containing a list of strings that come from having parsed the output of an LLM, the tokens in the statistics_{step_name} of the distilabel_metadata column should be splitted to avoid multiplying them if we aggregate the data afterwards. For example, with a task that is supposed to generate a list of N instructions, and we want each of those N instructions in different rows, we should split the statistics by N. In such a case, set this value to True. Input columns - dynamic (determined by
columns attribute): The columns to be expanded into multiple rows. Output columns - dynamic (determined by
columns attribute): The expanded columns. Categories Examples: Expand the selected columns into multiple rows: from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"]}\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n Expand the selected columns which are JSON encoded into multiple rows: from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n encoded=True, # It can also be a list of columns that are encoded, i.e. [\"generation\"]\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": '[\"generation 1\", \"generation 2\"]'}\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n Expand the selected columns and split the statistics in the distilabel_metadata column: from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n split_statistics=True,\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"],\n \"distilabel_metadata\": {\n \"statistics_generation\": {\n \"input_tokens\": [12],\n \"output_tokens\": [12],\n },\n },\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}, {'instruction': 'instruction 1', 'generation': 'generation 2', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}]\n Source code in src/distilabel/steps/columns/expand.py class ExpandColumns(Step):\n \"\"\"Expand columns that contain lists into multiple rows.\n\n `ExpandColumns` is a `Step` that takes a list of columns and expands them into multiple\n rows. The new rows will have the same data as the original row, except for the expanded\n column, which will contain a single item from the original list.\n\n Attributes:\n columns: A dictionary that maps the column to be expanded to the new column name\n or a list of columns to be expanded. If a list is provided, the new column name\n will be the same as the column name.\n encoded: A bool to inform Whether the columns are JSON encoded lists. If this value is\n set to True, the columns will be decoded before expanding. Alternatively, to specify\n columns that can be encoded, a list can be provided. In this case, the column names\n informed must be a subset of the columns selected for expansion.\n split_statistics: A bool to inform whether the statistics in the `distilabel_metadata`\n column should be split into multiple rows.\n If we want to expand some columns containing a list of strings that come from\n having parsed the output of an LLM, the tokens in the `statistics_{step_name}`\n of the `distilabel_metadata` column should be splitted to avoid multiplying\n them if we aggregate the data afterwards. 
For example, with a task that is supposed\n to generate a list of N instructions, and we want each of those N instructions in\n different rows, we should split the statistics by N.\n In such a case, set this value to True.\n\n Input columns:\n - dynamic (determined by `columns` attribute): The columns to be expanded into\n multiple rows.\n\n Output columns:\n - dynamic (determined by `columns` attribute): The expanded columns.\n\n Categories:\n - columns\n\n Examples:\n Expand the selected columns into multiple rows:\n\n ```python\n from distilabel.steps import ExpandColumns\n\n expand_columns = ExpandColumns(\n columns=[\"generation\"],\n )\n expand_columns.load()\n\n result = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"]}\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n ```\n\n Expand the selected columns which are JSON encoded into multiple rows:\n\n ```python\n from distilabel.steps import ExpandColumns\n\n expand_columns = ExpandColumns(\n columns=[\"generation\"],\n encoded=True, # It can also be a list of columns that are encoded, i.e. [\"generation\"]\n )\n expand_columns.load()\n\n result = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": '[\"generation 1\", \"generation 2\"]'}\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n ```\n\n Expand the selected columns and split the statistics in the `distilabel_metadata` column:\n\n ```python\n from distilabel.steps import ExpandColumns\n\n expand_columns = ExpandColumns(\n columns=[\"generation\"],\n split_statistics=True,\n )\n expand_columns.load()\n\n result = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"],\n \"distilabel_metadata\": {\n \"statistics_generation\": {\n \"input_tokens\": [12],\n \"output_tokens\": [12],\n },\n },\n }\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction 1', 'generation': 'generation 1', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}, {'instruction': 'instruction 1', 'generation': 'generation 2', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}]\n ```\n \"\"\"\n\n columns: Union[Dict[str, str], List[str]]\n encoded: Union[bool, List[str]] = False\n split_statistics: bool = False\n\n @field_validator(\"columns\")\n @classmethod\n def always_dict(cls, value: Union[Dict[str, str], List[str]]) -> Dict[str, str]:\n \"\"\"Ensure that the columns are always a dictionary.\n\n Args:\n value: The columns to be expanded.\n\n Returns:\n The columns to be expanded as a dictionary.\n \"\"\"\n if isinstance(value, list):\n return {col: col for col in value}\n\n return value\n\n @model_validator(mode=\"after\")\n def is_subset(self) -> Self:\n \"\"\"Ensure the \"encoded\" column names are a subset of the \"columns\" selected.\n\n Returns:\n The \"encoded\" attribute updated to work internally.\n \"\"\"\n if isinstance(self.encoded, list):\n if not set(self.encoded).issubset(set(self.columns.keys())):\n raise ValueError(\n \"The 'encoded' columns must be a subset of the 'columns' selected for expansion.\"\n )\n if isinstance(self.encoded, bool):\n self.encoded = 
list(self.columns.keys()) if self.encoded else []\n return self\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The columns to be expanded.\"\"\"\n return list(self.columns.keys())\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The expanded columns.\"\"\"\n return [\n new_column if new_column else expand_column\n for expand_column, new_column in self.columns.items()\n ]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Expand the columns in the input data.\n\n Args:\n inputs: The input data.\n\n Yields:\n The expanded rows.\n \"\"\"\n if self.encoded:\n for input in inputs:\n for column in self.encoded:\n input[column] = json.loads(input[column])\n\n yield [row for input in inputs for row in self._expand_columns(input)]\n\n def _expand_columns(self, input: Dict[str, Any]) -> List[Dict[str, Any]]:\n \"\"\"Expand the columns in the input data.\n\n Args:\n input: The input data.\n\n Returns:\n The expanded rows.\n \"\"\"\n metadata_visited = False\n expanded_rows = []\n # Update the columns here to avoid doing the validation on the `inputs`, as the\n # `distilabel_metadata` is not defined on Pipeline creation on the DAG.\n columns = self.columns\n if self.split_statistics:\n columns[\"distilabel_metadata\"] = \"distilabel_metadata\"\n\n for expand_column, new_column in columns.items(): # type: ignore\n data = input.get(expand_column)\n input, metadata_visited = self._split_metadata(\n input, len(data), metadata_visited\n )\n\n rows = []\n for item, expanded in zip_longest(*[data, expanded_rows], fillvalue=input):\n rows.append({**expanded, new_column: item})\n expanded_rows = rows\n return expanded_rows\n\n def _split_metadata(\n self, input: Dict[str, Any], n: int, metadata_visited: bool = False\n ) -> None:\n \"\"\"Help method to split the statistics in `distilabel_metadata` column.\n\n Args:\n input: The input data.\n n: Number of splits to apply to the tokens (if we have 12 tokens and want to split\n them 3 times, n==3).\n metadata_visited: Bool to prevent from updating the data more than once.\n\n Returns:\n Updated input with the `distilabel_metadata` updated.\n \"\"\"\n # - If we want to split the statistics, we need to ensure that the metadata is present.\n # - Metadata can only be visited once per row to avoid successive splitting.\n # TODO: For an odd number of tokens, this will miss 1, we have to fix it.\n if (\n self.split_statistics\n and (metadata := input.get(\"distilabel_metadata\", {}))\n and not metadata_visited\n ):\n for k, v in metadata.items():\n if (\n not v\n ): # In case it wasn't found in the metadata for some error, skip it\n continue\n if k.startswith(\"statistics_\") and (\n \"input_tokens\" in v and \"output_tokens\" in v\n ):\n # For num_generations>1 we assume all the tokens should be divided by n\n # TODO: The tokens should always come as a list, but there can\n # be differences\n if isinstance(v[\"input_tokens\"], list):\n input_tokens = [value // n for value in v[\"input_tokens\"]]\n else:\n input_tokens = [v[\"input_tokens\"] // n]\n if isinstance(v[\"input_tokens\"], list):\n output_tokens = [value // n for value in v[\"output_tokens\"]]\n else:\n output_tokens = [v[\"output_tokens\"] // n]\n\n input[\"distilabel_metadata\"][k] = {\n \"input_tokens\": input_tokens,\n \"output_tokens\": output_tokens,\n }\n metadata_visited = True\n # Once we have updated the metadata, Create a list out of it to let the\n # following section to expand it as any other column.\n if 
isinstance(input[\"distilabel_metadata\"], dict):\n input[\"distilabel_metadata\"] = [input[\"distilabel_metadata\"]] * n\n return input, metadata_visited\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.inputs","title":"inputs: StepColumns property ","text":"The columns to be expanded. "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.outputs","title":"outputs: StepColumns property ","text":"The expanded columns. "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.always_dict","title":"always_dict(value) classmethod ","text":"Ensure that the columns are always a dictionary. Parameters: Name Type Description Default value Union[Dict[str, str], List[str]] The columns to be expanded. required Returns: Type Description Dict[str, str] The columns to be expanded as a dictionary. Source code in src/distilabel/steps/columns/expand.py @field_validator(\"columns\")\n@classmethod\ndef always_dict(cls, value: Union[Dict[str, str], List[str]]) -> Dict[str, str]:\n \"\"\"Ensure that the columns are always a dictionary.\n\n Args:\n value: The columns to be expanded.\n\n Returns:\n The columns to be expanded as a dictionary.\n \"\"\"\n if isinstance(value, list):\n return {col: col for col in value}\n\n return value\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.is_subset","title":"is_subset() ","text":"Ensure the \"encoded\" column names are a subset of the \"columns\" selected. Returns: Type Description Self The \"encoded\" attribute updated to work internally. Source code in src/distilabel/steps/columns/expand.py @model_validator(mode=\"after\")\ndef is_subset(self) -> Self:\n \"\"\"Ensure the \"encoded\" column names are a subset of the \"columns\" selected.\n\n Returns:\n The \"encoded\" attribute updated to work internally.\n \"\"\"\n if isinstance(self.encoded, list):\n if not set(self.encoded).issubset(set(self.columns.keys())):\n raise ValueError(\n \"The 'encoded' columns must be a subset of the 'columns' selected for expansion.\"\n )\n if isinstance(self.encoded, bool):\n self.encoded = list(self.columns.keys()) if self.encoded else []\n return self\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.process","title":"process(inputs) ","text":"Expand the columns in the input data. Parameters: Name Type Description Default inputs StepInput The input data. required Yields: Type Description StepOutput The expanded rows. Source code in src/distilabel/steps/columns/expand.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Expand the columns in the input data.\n\n Args:\n inputs: The input data.\n\n Yields:\n The expanded rows.\n \"\"\"\n if self.encoded:\n for input in inputs:\n for column in self.encoded:\n input[column] = json.loads(input[column])\n\n yield [row for input in inputs for row in self._expand_columns(input)]\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep","title":"keep ","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns","title":"KeepColumns ","text":" Bases: Step Keeps selected columns in the dataset. KeepColumns is a Step that implements the process method that keeps only the columns specified in the columns attribute. Also KeepColumns provides an attribute columns to specify the columns to keep which will override the default value for the properties inputs and outputs . 
Note The order in which the columns are provided is important, as the output will be sorted using the provided order, which is useful before pushing either a datasets.Dataset via the PushToHub step or a distilabel.Distiset via the Pipeline.run output variable. Attributes: Name Type Description columns List[str] List of strings with the names of the columns to keep. Input columns - dynamic (determined by
columns attribute): The columns to keep. Output columns - dynamic (determined by
columns attribute): The columns that were kept. Categories Examples: Select the columns to keep: from distilabel.steps import KeepColumns\n\nkeep_columns = KeepColumns(\n columns=[\"instruction\", \"generation\"],\n)\nkeep_columns.load()\n\nresult = next(\n keep_columns.process(\n [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n )\n)\n# >>> result\n# [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n Source code in src/distilabel/steps/columns/keep.py class KeepColumns(Step):\n \"\"\"Keeps selected columns in the dataset.\n\n `KeepColumns` is a `Step` that implements the `process` method that keeps only the columns\n specified in the `columns` attribute. Also `KeepColumns` provides an attribute `columns` to\n specify the columns to keep which will override the default value for the properties `inputs`\n and `outputs`.\n\n Note:\n The order in which the columns are provided is important, as the output will be sorted\n using the provided order, which is useful before pushing either a `dataset.Dataset` via\n the `PushToHub` step or a `distilabel.Distiset` via the `Pipeline.run` output variable.\n\n Attributes:\n columns: List of strings with the names of the columns to keep.\n\n Input columns:\n - dynamic (determined by `columns` attribute): The columns to keep.\n\n Output columns:\n - dynamic (determined by `columns` attribute): The columns that were kept.\n\n Categories:\n - columns\n\n Examples:\n Select the columns to keep:\n\n ```python\n from distilabel.steps import KeepColumns\n\n keep_columns = KeepColumns(\n columns=[\"instruction\", \"generation\"],\n )\n keep_columns.load()\n\n result = next(\n keep_columns.process(\n [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n )\n )\n # >>> result\n # [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n ```\n \"\"\"\n\n columns: List[str]\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task are the column names in `columns`.\"\"\"\n return self.columns\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs for the task are the column names in `columns`.\"\"\"\n return self.columns\n\n @override\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"The `process` method keeps only the columns specified in the `columns` attribute.\n\n Args:\n *inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n outputs = []\n for item in input:\n outputs.append({col: item[col] for col in self.columns})\n yield outputs\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task are the column names in columns . "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.outputs","title":"outputs: StepColumns property ","text":"The outputs for the task are the column names in columns . "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.process","title":"process(*inputs) ","text":"The process method keeps only the columns specified in the columns attribute. Parameters: Name Type Description Default *inputs StepInput A list of dictionaries with the input data. () Yields: Type Description StepOutput A list of dictionaries with the output data. 
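 Because each output row is rebuilt as {col: item[col] for col in self.columns} , the order of the columns attribute also determines the column order of the output rows. A minimal sketch (hypothetical column order) illustrating the reordering: from distilabel.steps import KeepColumns\n\nkeep_columns = KeepColumns(\n    columns=[\"generation\", \"instruction\"],  # reversed order on purpose\n)\nkeep_columns.load()\n\nresult = next(\n    keep_columns.process(\n        [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n    )\n)\n# >>> result\n# [{'generation': 'white', 'instruction': \"What's the brightest color?\"}]\n 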
Source code in src/distilabel/steps/columns/keep.py @override\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"The `process` method keeps only the columns specified in the `columns` attribute.\n\n Args:\n *inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n outputs = []\n for item in input:\n outputs.append({col: item[col] for col in self.columns})\n yield outputs\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.merge","title":"merge ","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.merge.MergeColumns","title":"MergeColumns ","text":" Bases: Step Merge columns from a row. MergeColumns is a Step that implements the process method that calls the merge_columns function to handle and combine columns in a StepInput . MergeColumns provides two attributes columns and output_column to specify the columns to merge and the resulting output column. This step can be useful if you have a Task that generates instructions for example, and you want to have more examples of those. In such a case, you could for example use another Task to multiply your instructions synthetically, what would yield two different columns splitted. Using MergeColumns you can merge them and use them as a single column in your dataset for further processing. Attributes: Name Type Description columns List[str] List of strings with the names of the columns to merge. output_column Optional[str] str name of the output column Input columns - dynamic (determined by
columns attribute): The columns to merge. Output columns - dynamic (determined by
columns and output_column attributes): The columns that were merged. Categories Examples: Combine columns in rows of a dataset: from distilabel.steps import MergeColumns\n\ncombiner = MergeColumns(\n columns=[\"queries\", \"multiple_queries\"],\n output_column=\"queries\",\n)\ncombiner.load()\n\nresult = next(\n combiner.process(\n [\n {\n \"queries\": \"How are you?\",\n \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n }\n ],\n )\n)\n# >>> result\n# [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n Source code in src/distilabel/steps/columns/merge.py class MergeColumns(Step):\n \"\"\"Merge columns from a row.\n\n `MergeColumns` is a `Step` that implements the `process` method that calls the `merge_columns`\n function to handle and combine columns in a `StepInput`. `MergeColumns` provides two attributes\n `columns` and `output_column` to specify the columns to merge and the resulting output column.\n\n This step can be useful if you have a `Task` that generates instructions for example, and you\n want to have more examples of those. In such a case, you could for example use another `Task`\n to multiply your instructions synthetically, what would yield two different columns splitted.\n Using `MergeColumns` you can merge them and use them as a single column in your dataset for\n further processing.\n\n Attributes:\n columns: List of strings with the names of the columns to merge.\n output_column: str name of the output column\n\n Input columns:\n - dynamic (determined by `columns` attribute): The columns to merge.\n\n Output columns:\n - dynamic (determined by `columns` and `output_column` attributes): The columns\n that were merged.\n\n Categories:\n - columns\n\n Examples:\n Combine columns in rows of a dataset:\n\n ```python\n from distilabel.steps import MergeColumns\n\n combiner = MergeColumns(\n columns=[\"queries\", \"multiple_queries\"],\n output_column=\"queries\",\n )\n combiner.load()\n\n result = next(\n combiner.process(\n [\n {\n \"queries\": \"How are you?\",\n \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n }\n ],\n )\n )\n # >>> result\n # [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n ```\n \"\"\"\n\n columns: List[str]\n output_column: Optional[str] = None\n\n @property\n def inputs(self) -> \"StepColumns\":\n return self.columns\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [self.output_column] if self.output_column else [\"merged_column\"]\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\":\n combined = []\n for input in inputs:\n combined.append(\n merge_columns(\n input,\n columns=self.columns,\n new_column=self.outputs[0],\n )\n )\n yield combined\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group","title":"group ","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns","title":"GroupColumns ","text":" Bases: Step Combines columns from a list of StepInput . GroupColumns is a Step that implements the process method that calls the group_dicts function to handle and combine a list of StepInput . Also GroupColumns provides two attributes columns and output_columns to specify the columns to group and the output columns which will override the default value for the properties inputs and outputs , respectively. Attributes: Name Type Description columns List[str] List of strings with the names of the columns to group. output_columns Optional[List[str]] Optional list of strings with the names of the output columns. 
Input columns - dynamic (determined by
columns attribute): The columns to group. Output columns - dynamic (determined by
columns and output_columns attributes): The columns that were grouped. Categories Examples: Group columns of a dataset:\n\n```python\nfrom distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n)\ngroup_columns.load()\n\nresult = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n)\n# >>> result\n# [{'merged_generation': ['AI generated text', 'Other generated text'], 'merged_model_name': ['my_model']}]\n```\n\nSpecify the name of the output columns:\n\n```python\nfrom distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"generation_models\"]\n)\ngroup_columns.load()\n\nresult = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n)\n# >>> result\n#[{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n```\n Source code in src/distilabel/steps/columns/group.py class GroupColumns(Step):\n \"\"\"Combines columns from a list of `StepInput`.\n\n `GroupColumns` is a `Step` that implements the `process` method that calls the `group_dicts`\n function to handle and combine a list of `StepInput`. Also `GroupColumns` provides two attributes\n `columns` and `output_columns` to specify the columns to group and the output columns\n which will override the default value for the properties `inputs` and `outputs`, respectively.\n\n Attributes:\n columns: List of strings with the names of the columns to group.\n output_columns: Optional list of strings with the names of the output columns.\n\n Input columns:\n - dynamic (determined by `columns` attribute): The columns to group.\n\n Output columns:\n - dynamic (determined by `columns` and `output_columns` attributes): The columns\n that were grouped.\n\n Categories:\n - columns\n\n Examples:\n\n Group columns of a dataset:\n\n ```python\n from distilabel.steps import GroupColumns\n\n group_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n )\n group_columns.load()\n\n result = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n )\n # >>> result\n # [{'merged_generation': ['AI generated text', 'Other generated text'], 'merged_model_name': ['my_model']}]\n ```\n\n Specify the name of the output columns:\n\n ```python\n from distilabel.steps import GroupColumns\n\n group_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"generation_models\"]\n )\n group_columns.load()\n\n result = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n )\n # >>> result\n #[{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n ```\n \"\"\"\n\n columns: List[str]\n output_columns: Optional[List[str]] = None\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task are the column names in 
`columns`.\"\"\"\n return self.columns\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs for the task are the column names in `output_columns` or\n `grouped_{column}` for each column in `columns`.\"\"\"\n return (\n self.output_columns\n if self.output_columns is not None\n else [f\"grouped_{column}\" for column in self.columns]\n )\n\n @override\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"The `process` method calls the `group_dicts` function to handle and combine a list of `StepInput`.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with the combined `StepInput` using the `group_dicts` function.\n \"\"\"\n yield group_columns(\n *inputs,\n group_columns=self.inputs,\n output_group_columns=self.outputs,\n )\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task are the column names in columns . "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.outputs","title":"outputs: StepColumns property ","text":"The outputs for the task are the column names in output_columns or grouped_{column} for each column in columns . "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.process","title":"process(*inputs) ","text":"The process method calls the group_dicts function to handle and combine a list of StepInput . Parameters: Name Type Description Default *inputs StepInput A list of StepInput to be combined. () Yields: Type Description StepOutput A StepOutput with the combined StepInput using the group_dicts function. Source code in src/distilabel/steps/columns/group.py @override\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n \"\"\"The `process` method calls the `group_dicts` function to handle and combine a list of `StepInput`.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with the combined `StepInput` using the `group_dicts` function.\n \"\"\"\n yield group_columns(\n *inputs,\n group_columns=self.inputs,\n output_group_columns=self.outputs,\n )\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.CombineColumns","title":"CombineColumns ","text":" Bases: GroupColumns CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead. Source code in src/distilabel/steps/columns/group.py class CombineColumns(GroupColumns):\n \"\"\"`CombineColumns` is deprecated and will be removed in version 1.5.0, use `GroupColumns` instead.\"\"\"\n\n def __init__(self, **data: Any) -> None:\n warnings.warn(\n \"`CombineColumns` is deprecated and will be removed in version 1.5.0, use `GroupColumns` instead.\",\n DeprecationWarning,\n stacklevel=2,\n )\n return super().__init__(**data)\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils","title":"utils ","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.merge_distilabel_metadata","title":"merge_distilabel_metadata(*output_dicts) ","text":"Merge the DISTILABEL_METADATA_KEY from multiple output dictionaries. Parameters: Name Type Description Default *output_dicts Dict[str, Any] Variable number of dictionaries containing distilabel metadata. () Returns: Type Description Dict[str, Any] A merged dictionary containing all the distilabel metadata from the input dictionaries. 
Source code in src/distilabel/steps/columns/utils.py def merge_distilabel_metadata(*output_dicts: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"\n Merge the `DISTILABEL_METADATA_KEY` from multiple output dictionaries.\n\n Args:\n *output_dicts: Variable number of dictionaries containing distilabel metadata.\n\n Returns:\n A merged dictionary containing all the distilabel metadata from the input dictionaries.\n \"\"\"\n merged_metadata = defaultdict(list)\n\n for output_dict in output_dicts:\n metadata = output_dict.get(DISTILABEL_METADATA_KEY, {})\n for key, value in metadata.items():\n merged_metadata[key].append(value)\n\n final_metadata = {}\n for key, value_list in merged_metadata.items():\n if len(value_list) == 1:\n final_metadata[key] = value_list[0]\n else:\n final_metadata[key] = value_list\n\n return final_metadata\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.group_columns","title":"group_columns(*inputs, group_columns, output_group_columns=None) ","text":"Groups multiple list of dictionaries into a single list of dictionaries on the specified group_columns . If group_columns are provided, then it will also rename group_columns . Parameters: Name Type Description Default inputs StepInput list of dictionaries to combine. () group_columns List[str] list of keys to merge on. required output_group_columns Optional[List[str]] list of keys to rename the merge keys to. Defaults to None . None Returns: Type Description StepInput A list of dictionaries where the values of the group_columns are combined into a StepInput list and renamed to output_group_columns . Source code in src/distilabel/steps/columns/utils.py def group_columns(\n *inputs: \"StepInput\",\n group_columns: List[str],\n output_group_columns: Optional[List[str]] = None,\n) -> \"StepInput\":\n \"\"\"Groups multiple list of dictionaries into a single list of dictionaries on the\n specified `group_columns`. If `group_columns` are provided, then it will also rename\n `group_columns`.\n\n Args:\n inputs: list of dictionaries to combine.\n group_columns: list of keys to merge on.\n output_group_columns: list of keys to rename the merge keys to. 
Defaults to `None`.\n\n Returns:\n A list of dictionaries where the values of the `group_columns` are combined into a\n list and renamed to `output_group_columns`.\n \"\"\"\n if output_group_columns is not None and len(output_group_columns) != len(\n group_columns\n ):\n raise ValueError(\n \"The length of `output_group_columns` must be the same as the length of `group_columns`.\"\n )\n if output_group_columns is None:\n output_group_columns = [f\"grouped_{key}\" for key in group_columns]\n group_columns_dict = dict(zip(group_columns, output_group_columns))\n\n result = []\n # Use zip to iterate over lists based on their index\n for dicts_at_index in zip(*inputs):\n combined_dict = {}\n metadata_dicts = []\n # Iterate over dicts at the same index\n for d in dicts_at_index:\n # Extract metadata for merging\n if DISTILABEL_METADATA_KEY in d:\n metadata_dicts.append(\n {DISTILABEL_METADATA_KEY: d[DISTILABEL_METADATA_KEY]}\n )\n # Iterate over key-value pairs in each dict\n for key, value in d.items():\n if key == DISTILABEL_METADATA_KEY:\n continue\n # If the key is in the merge_keys, append the value to the existing list\n if key in group_columns_dict.keys():\n combined_dict.setdefault(group_columns_dict[key], []).append(value)\n # If the key is not in the merge_keys, create a new key-value pair\n else:\n combined_dict[key] = value\n\n if metadata_dicts:\n combined_dict[DISTILABEL_METADATA_KEY] = merge_distilabel_metadata(\n *metadata_dicts\n )\n\n result.append(combined_dict)\n return result\n "},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.merge_columns","title":"merge_columns(row, columns, new_column='combined_key') ","text":"Merge columns in a dictionary into a single column on the specified new_column . Parameters: Name Type Description Default row Dict[str, Any] Dictionary corresponding to a row in a dataset. required columns List[str] List of keys to merge. required new_column str Name of the new key created. 'combined_key' Returns: Type Description Dict[str, Any] Dictionary with the new merged key. Source code in src/distilabel/steps/columns/utils.py def merge_columns(\n row: Dict[str, Any], columns: List[str], new_column: str = \"combined_key\"\n) -> Dict[str, Any]:\n \"\"\"Merge columns in a dictionary into a single column on the specified `new_column`.\n\n Args:\n row: Dictionary corresponding to a row in a dataset.\n columns: List of keys to merge.\n new_column: Name of the new key created.\n\n Returns:\n Dictionary with the new merged key.\n \"\"\"\n result = row.copy() # preserve the original dictionary\n combined = []\n for key in columns:\n to_combine = result.pop(key)\n if not isinstance(to_combine, list):\n to_combine = [to_combine]\n combined += to_combine\n result[new_column] = combined\n return result\n "},{"location":"api/step_gallery/extra/","title":"Extra","text":""},{"location":"api/step_gallery/extra/#distilabel.steps","title":"steps ","text":""},{"location":"api/step_gallery/extra/#distilabel.steps.DBSCAN","title":"DBSCAN ","text":" Bases: GlobalStep DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples in regions of high density and expands clusters from them. This algorithm is good for data which contains clusters of similar density. This is a GlobalStep that clusters the embeddings using the DBSCAN algorithm from sklearn . Visit TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub. Input columns - projection (
List[float] ): Vector representation of the text to cluster, normally the output from the UMAP step. Output columns - cluster_label (
int ): Integer representing the label of a given cluster. -1 means it wasn't clustered. Categories - clustering
- text-classification
References DBSCAN demo of sklearn sklearn dbscan Attributes: Name Type Description - eps The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. - min_samples The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse. - metric The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter. - n_jobs The number of parallel jobs to run. Runtime parameters eps : The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. min_samples : The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse. metric : The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter. n_jobs : The number of parallel jobs to run. Source code in src/distilabel/steps/clustering/dbscan.py class DBSCAN(GlobalStep):\n r\"\"\"DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core\n samples in regions of high density and expands clusters from them. This algorithm\n is good for data which contains clusters of similar density.\n\n This is a `GlobalStep` that clusters the embeddings using the DBSCAN algorithm\n from `sklearn`. Visit `TextClustering` step for an example of use.\n The trained model is saved as an artifact when creating a distiset\n and pushing it to the Hugging Face Hub.\n\n Input columns:\n - projection (`List[float]`): Vector representation of the text to cluster,\n normally the output from the `UMAP` step.\n\n Output columns:\n - cluster_label (`int`): Integer representing the label of a given cluster. -1\n means it wasn't clustered.\n\n Categories:\n - clustering\n - text-classification\n\n References:\n - [`DBSCAN demo of sklearn`](https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#demo-of-dbscan-clustering-algorithm)\n - [`sklearn dbscan`](https://scikit-learn.org/stable/modules/clustering.html#dbscan)\n\n Attributes:\n - eps: The maximum distance between two samples for one to be considered as in the\n neighborhood of the other. This is not a maximum bound on the distances of\n points within a cluster. This is the most important DBSCAN parameter to\n choose appropriately for your data set and distance function.\n - min_samples: The number of samples (or total weight) in a neighborhood for a point\n to be considered as a core point. This includes the point itself. 
If `min_samples`\n is set to a higher value, DBSCAN will find denser clusters, whereas if it is set\n to a lower value, the found clusters will be more sparse.\n - metric: The metric to use when calculating distance between instances in a feature\n array. If metric is a string or callable, it must be one of the options allowed\n by `sklearn.metrics.pairwise_distances` for its metric parameter.\n - n_jobs: The number of parallel jobs to run.\n\n Runtime parameters:\n - `eps`: The maximum distance between two samples for one to be considered as in the\n neighborhood of the other. This is not a maximum bound on the distances of\n points within a cluster. This is the most important DBSCAN parameter to\n choose appropriately for your data set and distance function.\n - `min_samples`: The number of samples (or total weight) in a neighborhood for a point\n to be considered as a core point. This includes the point itself. If `min_samples`\n is set to a higher value, DBSCAN will find denser clusters, whereas if it is set\n to a lower value, the found clusters will be more sparse.\n - `metric`: The metric to use when calculating distance between instances in a feature\n array. If metric is a string or callable, it must be one of the options allowed\n by `sklearn.metrics.pairwise_distances` for its metric parameter.\n - `n_jobs`: The number of parallel jobs to run.\n \"\"\"\n\n eps: Optional[RuntimeParameter[float]] = Field(\n default=0.3,\n description=(\n \"The maximum distance between two samples for one to be considered \"\n \"as in the neighborhood of the other. This is not a maximum bound \"\n \"on the distances of points within a cluster. This is the most \"\n \"important DBSCAN parameter to choose appropriately for your data set \"\n \"and distance function.\"\n ),\n )\n min_samples: Optional[RuntimeParameter[int]] = Field(\n default=30,\n description=(\n \"The number of samples (or total weight) in a neighborhood for a point to \"\n \"be considered as a core point. This includes the point itself. If \"\n \"`min_samples` is set to a higher value, DBSCAN will find denser clusters, \"\n \"whereas if it is set to a lower value, the found clusters will be more \"\n \"sparse.\"\n ),\n )\n metric: Optional[RuntimeParameter[str]] = Field(\n default=\"euclidean\",\n description=(\n \"The metric to use when calculating distance between instances in a \"\n \"feature array. If metric is a string or callable, it must be one of \"\n \"the options allowed by `sklearn.metrics.pairwise_distances` for \"\n \"its metric parameter.\"\n ),\n )\n n_jobs: Optional[RuntimeParameter[int]] = Field(\n default=8, description=\"The number of parallel jobs to run.\"\n )\n\n _clusterer: Optional[\"_DBSCAN\"] = PrivateAttr(None)\n\n def load(self) -> None:\n super().load()\n if importlib.util.find_spec(\"sklearn\") is None:\n raise ImportError(\n \"`sklearn` package is not installed. 
Please install it using `pip install scikit-learn`.\"\n )\n from sklearn.cluster import DBSCAN as _DBSCAN\n\n self._clusterer = _DBSCAN(\n eps=self.eps,\n min_samples=self.min_samples,\n metric=self.metric,\n n_jobs=self.n_jobs,\n )\n\n def unload(self) -> None:\n self._clusterer = None\n\n @property\n def inputs(self) -> List[str]:\n return [\"projection\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"cluster_label\"]\n\n def _save_model(self, model: Any) -> None:\n import joblib\n\n def save_model(path):\n with open(str(path / \"DBSCAN.joblib\"), \"wb\") as f:\n joblib.dump(model, f)\n\n self.save_artifact(\n name=\"DBSCAN_model\",\n write_function=lambda path: save_model(path),\n metadata={\n \"eps\": self.eps,\n \"min_samples\": self.min_samples,\n \"metric\": self.metric,\n },\n )\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n projections = np.array([input[\"projection\"] for input in inputs])\n\n self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Start training DBSCAN...\")\n fitted_clusterer = self._clusterer.fit(projections)\n cluster_labels = fitted_clusterer.labels_\n # Sets the cluster labels for each input, -1 means it wasn't clustered\n for input, cluster_label in zip(inputs, cluster_labels):\n input[\"cluster_label\"] = cluster_label\n self._logger.info(f\"DBSCAN labels assigned: {len(set(cluster_labels))}\")\n self._save_model(fitted_clusterer)\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering","title":"TextClustering ","text":" Bases: TextClassification , GlobalTask Task that clusters a set of texts and generates summary labels for each cluster. This is a GlobalTask that inherits from TextClassification , this means that all the attributes from that class are available here. Also, in this case we deal with all the inputs at once, instead of using batches. The input_batch_size is used here to send the examples to the LLM in batches (a subtle difference with the more common Task definitions). The task looks in each cluster for a given number of representative examples (the number is set by the samples_per_cluster attribute), and sends them to the LLM to get a label/s that represent the cluster. The labels are then assigned to each text in the cluster. The clusters and projections used in the step, are assumed to be obtained from the UMAP + DBSCAN steps, but could be generated for similar steps, as long as they represent the same concepts. This step runs a pipeline like the one in this repository: https://github.com/huggingface/text-clustering Input columns - text (
str ): The reference text we want to obtain labels for. - projection (
List[float] ): Vector representation of the text to cluster, normally the output from the UMAP step. - cluster_label (
int ): Integer representing the label of a given cluster. -1 means it wasn't clustered. Output columns - summary_label (
str ): The label or list of labels for the text. - model_name (
str ): The name of the model used to generate the label/s. Categories - clustering
- text-classification
References text-clustering repository Attributes: Name Type Description - savefig Whether to generate and save a figure with the clustering of the texts. - samples_per_cluster The number of examples to use in the LLM as a sample of the cluster. Examples: Generate labels for a set of texts using clustering: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import UMAP, DBSCAN, TextClustering\nfrom distilabel.pipeline import Pipeline\n\nds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\nwith Pipeline(name=\"Text clustering dataset\") as pipeline:\n batch_size = 500\n\n ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n umap = UMAP(n_components=2, metric=\"cosine\")\n dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n text_clustering = TextClustering(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n n=3, # 3 labels per example\n query_title=\"Examples of Personas\",\n samples_per_cluster=10,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following types of personas. All the examples of personas must share \"\n \"the same set of labels.\"\n ),\n default_label=\"None\",\n savefig=True,\n input_batch_size=8,\n input_mappings={\"text\": \"persona\"},\n use_default_structured_output=True,\n )\n\n loader >> umap >> dbscan >> text_clustering\n Source code in src/distilabel/steps/clustering/text_clustering.py class TextClustering(TextClassification, GlobalTask):\n \"\"\"Task that clusters a set of texts and generates summary labels for each cluster.\n\n This is a `GlobalTask` that inherits from `TextClassification`, this means that all\n the attributes from that class are available here. Also, in this case we deal\n with all the inputs at once, instead of using batches. The `input_batch_size` is\n used here to send the examples to the LLM in batches (a subtle difference with the\n more common `Task` definitions).\n The task looks in each cluster for a given number of representative examples (the number\n is set by the `samples_per_cluster` attribute), and sends them to the LLM to get a label/s\n that represent the cluster. The labels are then assigned to each text in the cluster.\n The clusters and projections used in the step, are assumed to be obtained from the `UMAP`\n + `DBSCAN` steps, but could be generated for similar steps, as long as they represent the\n same concepts.\n This step runs a pipeline like the one in this repository:\n https://github.com/huggingface/text-clustering\n\n Input columns:\n - text (`str`): The reference text we want to obtain labels for.\n - projection (`List[float]`): Vector representation of the text to cluster,\n normally the output from the `UMAP` step.\n - cluster_label (`int`): Integer representing the label of a given cluster. 
-1\n means it wasn't clustered.\n\n Output columns:\n - summary_label (`str`): The label or list of labels for the text.\n - model_name (`str`): The name of the model used to generate the label/s.\n\n Categories:\n - clustering\n - text-classification\n\n References:\n - [`text-clustering repository`](https://github.com/huggingface/text-clustering)\n\n Attributes:\n - savefig: Whether to generate and save a figure with the clustering of the texts.\n - samples_per_cluster: The number of examples to use in the LLM as a sample of the cluster.\n\n Examples:\n Generate labels for a set of texts using clustering:\n\n ```python\n from distilabel.models import InferenceEndpointsLLM\n from distilabel.steps import UMAP, DBSCAN, TextClustering\n from distilabel.pipeline import Pipeline\n\n ds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\n with Pipeline(name=\"Text clustering dataset\") as pipeline:\n batch_size = 500\n\n ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n umap = UMAP(n_components=2, metric=\"cosine\")\n dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n text_clustering = TextClustering(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n n=3, # 3 labels per example\n query_title=\"Examples of Personas\",\n samples_per_cluster=10,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following types of personas. All the examples of personas must share \"\n \"the same set of labels.\"\n ),\n default_label=\"None\",\n savefig=True,\n input_batch_size=8,\n input_mappings={\"text\": \"persona\"},\n use_default_structured_output=True,\n )\n\n loader >> umap >> dbscan >> text_clustering\n ```\n \"\"\"\n\n savefig: Optional[RuntimeParameter[bool]] = Field(\n default=True,\n description=\"Whether to generate and save a figure with the clustering of the texts.\",\n )\n samples_per_cluster: int = Field(\n default=10,\n description=\"The number of examples to use in the LLM as a sample of the cluster.\",\n )\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task are the same as those for `TextClassification` plus\n the `projection` and `cluster_label` columns (which can be obtained from\n UMAP + DBSCAN steps).\n \"\"\"\n return super().inputs + [\"projection\", \"cluster_label\"]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `summary_label` and the `model_name`.\"\"\"\n return [\"summary_label\", \"model_name\"]\n\n def load(self) -> None:\n super().load()\n if self.savefig and (importlib.util.find_spec(\"matplotlib\") is None):\n raise ImportError(\n \"`matplotlib` package is not installed. 
Please install it using `pip install matplotlib`.\"\n )\n\n def _save_figure(\n self,\n data: pd.DataFrame,\n cluster_centers: Dict[str, Tuple[float, float]],\n cluster_summaries: Dict[int, str],\n ) -> None:\n \"\"\"Saves the figure starting from the dataframe, using matplotlib.\n\n Args:\n data: pd.DataFrame with the columns 'X', 'Y' and 'labels' representing\n the projections and the label of each text respectively.\n cluster_centers: Dictionary mapping from each label the center of a cluster,\n to help with the placement of the annotations.\n cluster_summaries: The summaries of the clusters, obtained from the LLM.\n \"\"\"\n import matplotlib.pyplot as plt\n\n fig, ax = plt.subplots(figsize=(12, 8), dpi=300)\n unique_labels = data[\"labels\"].unique()\n # Map of colors for each label (-1 is black)\n colormap = dict(\n zip(unique_labels, plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))))\n )\n colormap[-1] = np.array([0, 0, 0, 0])\n data[\"color\"] = data[\"labels\"].map(colormap)\n\n data.plot(\n kind=\"scatter\",\n x=\"X\",\n y=\"Y\",\n c=\"color\",\n s=0.75,\n alpha=0.8,\n linewidth=0.4,\n ax=ax,\n colorbar=False,\n )\n\n for label in cluster_summaries.keys():\n if label == -1:\n continue\n summary = str(cluster_summaries[label]) # These are obtained from the LLM\n position = cluster_centers[label]\n t = ax.text(\n position[0],\n position[1],\n summary,\n horizontalalignment=\"center\",\n verticalalignment=\"center\",\n fontsize=4,\n )\n t.set_bbox(\n {\n \"facecolor\": \"white\",\n \"alpha\": 0.9,\n \"linewidth\": 0,\n \"boxstyle\": \"square,pad=0.1\",\n }\n )\n\n ax.set_axis_off()\n # Save the plot as an artifact of the step\n self.save_artifact(\n name=\"Text clusters\",\n write_function=lambda path: fig.savefig(path / \"figure_clustering.png\"),\n metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n )\n plt.close()\n\n def _create_figure(\n self,\n inputs: StepInput,\n label2docs: Dict[int, List[str]],\n cluster_summaries: Dict[int, str],\n ) -> None:\n \"\"\"Creates a figure of the clustered texts and save it as an artifact.\n\n Args:\n inputs: The inputs of the step, as we will extract information from them again.\n label2docs: Map from each label to the list of documents (texts) that belong to that cluster.\n cluster_summaries: The summaries of the clusters, obtained from the LLM.\n \"\"\"\n self._logger.info(\"\ud83d\uddbc\ufe0f Creating figure for the clusters...\")\n\n labels = []\n projections = []\n id2cluster = {}\n for i, input in enumerate(inputs):\n label = input[\"cluster_label\"]\n id2cluster[i] = label\n labels.append(label)\n projections.append(input[\"projection\"])\n\n projections = np.array(projections)\n\n # Contains the placement of the cluster centers in the figure\n cluster_centers: Dict[str, Tuple[float, float]] = {}\n for label in label2docs.keys():\n x = np.mean([projections[doc, 0] for doc in label2docs[label]])\n y = np.mean([projections[doc, 1] for doc in label2docs[label]])\n cluster_centers[label] = (x, y)\n\n df = pd.DataFrame(\n data={\n \"X\": projections[:, 0],\n \"Y\": projections[:, 1],\n \"labels\": labels,\n }\n )\n\n self._save_figure(\n df, cluster_centers=cluster_centers, cluster_summaries=cluster_summaries\n )\n\n def _prepare_input_texts(\n self,\n inputs: StepInput,\n label2docs: Dict[int, List[int]],\n unique_labels: List[int],\n ) -> List[Dict[str, Union[str, int]]]:\n \"\"\"Prepares a batch of inputs to send to the LLM, with the examples of each cluster.\n\n Args:\n inputs: Inputs from the step.\n label2docs: Map from 
each label to the list of documents (texts) that\n belong to that cluster.\n unique_labels: The unique labels of the clusters.\n\n Returns:\n The input texts to send to the LLM, with the examples of each cluster\n prepared to be used in the prompt, and an additional key to store the\n labels (that will be needed to find the data after the batches are\n returned from the LLM).\n \"\"\"\n input_texts = []\n for label in range(unique_labels): # The label -1 is implicitly excluded\n # Get the ids but remove possible duplicates, which could happen with bigger probability\n # the bigger the number of examples requested, and the smaller the subset of examples\n ids = set(\n np.random.choice(label2docs[label], size=self.samples_per_cluster)\n ) # Grab the number of examples\n examples = [inputs[i][\"text\"] for i in ids]\n input_text = {\n \"text\": \"\\n\\n\".join(\n [f\"Example {i}:\\n{t}\" for i, t in enumerate(examples, start=1)]\n ),\n \"__LABEL\": label,\n }\n input_texts.append(input_text)\n return input_texts\n\n def process(self, inputs: StepInput) -> \"StepOutput\":\n labels = [input[\"cluster_label\"] for input in inputs]\n # -1 because -1 is the label for the unclassified\n unique_labels = len(set(labels)) - 1\n # This will be the output of the LLM, the set of labels for each cluster\n cluster_summaries: Dict[int, str] = {-1: self.default_label}\n\n # Map from label to list of documents, will use them to select examples from each cluster\n label2docs = defaultdict(list)\n for i, label in enumerate(labels):\n label2docs[label].append(i)\n\n input_texts = self._prepare_input_texts(inputs, label2docs, unique_labels)\n\n # Send the texts in batches to the LLM, and get the labels for each cluster\n for i, batched_inputs in enumerate(batched(input_texts, self.input_batch_size)):\n self._logger.info(f\"\ud83d\udce6 Processing internal batch of inputs {i}...\")\n results = super().process(batched_inputs)\n for result in next(results): # Extract the elements from the generator\n cluster_summaries[result[\"__LABEL\"]] = result[\"labels\"]\n\n # Assign the labels to each text\n for input in inputs:\n input[\"summary_label\"] = json.dumps(\n cluster_summaries[input[\"cluster_label\"]]\n )\n\n if self.savefig:\n self._create_figure(inputs, label2docs, cluster_summaries)\n\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering.inputs","title":"inputs: List[str] property ","text":"The input for the task are the same as those for TextClassification plus the projection and cluster_label columns (which can be obtained from UMAP + DBSCAN steps). "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering.outputs","title":"outputs: List[str] property ","text":"The output for the task is the summary_label and the model_name . "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._save_figure","title":"_save_figure(data, cluster_centers, cluster_summaries) ","text":"Saves the figure starting from the dataframe, using matplotlib. Parameters: Name Type Description Default data DataFrame pd.DataFrame with the columns 'X', 'Y' and 'labels' representing the projections and the label of each text respectively. required cluster_centers Dict[str, Tuple[float, float]] Dictionary mapping from each label the center of a cluster, to help with the placement of the annotations. required cluster_summaries Dict[int, str] The summaries of the clusters, obtained from the LLM. 
required Source code in src/distilabel/steps/clustering/text_clustering.py def _save_figure(\n self,\n data: pd.DataFrame,\n cluster_centers: Dict[str, Tuple[float, float]],\n cluster_summaries: Dict[int, str],\n) -> None:\n \"\"\"Saves the figure starting from the dataframe, using matplotlib.\n\n Args:\n data: pd.DataFrame with the columns 'X', 'Y' and 'labels' representing\n the projections and the label of each text respectively.\n cluster_centers: Dictionary mapping from each label the center of a cluster,\n to help with the placement of the annotations.\n cluster_summaries: The summaries of the clusters, obtained from the LLM.\n \"\"\"\n import matplotlib.pyplot as plt\n\n fig, ax = plt.subplots(figsize=(12, 8), dpi=300)\n unique_labels = data[\"labels\"].unique()\n # Map of colors for each label (-1 is black)\n colormap = dict(\n zip(unique_labels, plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))))\n )\n colormap[-1] = np.array([0, 0, 0, 0])\n data[\"color\"] = data[\"labels\"].map(colormap)\n\n data.plot(\n kind=\"scatter\",\n x=\"X\",\n y=\"Y\",\n c=\"color\",\n s=0.75,\n alpha=0.8,\n linewidth=0.4,\n ax=ax,\n colorbar=False,\n )\n\n for label in cluster_summaries.keys():\n if label == -1:\n continue\n summary = str(cluster_summaries[label]) # These are obtained from the LLM\n position = cluster_centers[label]\n t = ax.text(\n position[0],\n position[1],\n summary,\n horizontalalignment=\"center\",\n verticalalignment=\"center\",\n fontsize=4,\n )\n t.set_bbox(\n {\n \"facecolor\": \"white\",\n \"alpha\": 0.9,\n \"linewidth\": 0,\n \"boxstyle\": \"square,pad=0.1\",\n }\n )\n\n ax.set_axis_off()\n # Save the plot as an artifact of the step\n self.save_artifact(\n name=\"Text clusters\",\n write_function=lambda path: fig.savefig(path / \"figure_clustering.png\"),\n metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n )\n plt.close()\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._create_figure","title":"_create_figure(inputs, label2docs, cluster_summaries) ","text":"Creates a figure of the clustered texts and save it as an artifact. Parameters: Name Type Description Default inputs StepInput The inputs of the step, as we will extract information from them again. required label2docs Dict[int, List[str]] Map from each label to the list of documents (texts) that belong to that cluster. required cluster_summaries Dict[int, str] The summaries of the clusters, obtained from the LLM. 
required Source code in src/distilabel/steps/clustering/text_clustering.py def _create_figure(\n self,\n inputs: StepInput,\n label2docs: Dict[int, List[str]],\n cluster_summaries: Dict[int, str],\n) -> None:\n \"\"\"Creates a figure of the clustered texts and save it as an artifact.\n\n Args:\n inputs: The inputs of the step, as we will extract information from them again.\n label2docs: Map from each label to the list of documents (texts) that belong to that cluster.\n cluster_summaries: The summaries of the clusters, obtained from the LLM.\n \"\"\"\n self._logger.info(\"\ud83d\uddbc\ufe0f Creating figure for the clusters...\")\n\n labels = []\n projections = []\n id2cluster = {}\n for i, input in enumerate(inputs):\n label = input[\"cluster_label\"]\n id2cluster[i] = label\n labels.append(label)\n projections.append(input[\"projection\"])\n\n projections = np.array(projections)\n\n # Contains the placement of the cluster centers in the figure\n cluster_centers: Dict[str, Tuple[float, float]] = {}\n for label in label2docs.keys():\n x = np.mean([projections[doc, 0] for doc in label2docs[label]])\n y = np.mean([projections[doc, 1] for doc in label2docs[label]])\n cluster_centers[label] = (x, y)\n\n df = pd.DataFrame(\n data={\n \"X\": projections[:, 0],\n \"Y\": projections[:, 1],\n \"labels\": labels,\n }\n )\n\n self._save_figure(\n df, cluster_centers=cluster_centers, cluster_summaries=cluster_summaries\n )\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._prepare_input_texts","title":"_prepare_input_texts(inputs, label2docs, unique_labels) ","text":"Prepares a batch of inputs to send to the LLM, with the examples of each cluster. Parameters: Name Type Description Default inputs StepInput Inputs from the step. required label2docs Dict[int, List[int]] Map from each label to the list of documents (texts) that belong to that cluster. required unique_labels List[int] The unique labels of the clusters. required Returns: Type Description List[Dict[str, Union[str, int]]] The input texts to send to the LLM, with the examples of each cluster List[Dict[str, Union[str, int]]] prepared to be used in the prompt, and an additional key to store the List[Dict[str, Union[str, int]]] labels (that will be needed to find the data after the batches are List[Dict[str, Union[str, int]]] returned from the LLM). 
Source code in src/distilabel/steps/clustering/text_clustering.py def _prepare_input_texts(\n self,\n inputs: StepInput,\n label2docs: Dict[int, List[int]],\n unique_labels: List[int],\n) -> List[Dict[str, Union[str, int]]]:\n \"\"\"Prepares a batch of inputs to send to the LLM, with the examples of each cluster.\n\n Args:\n inputs: Inputs from the step.\n label2docs: Map from each label to the list of documents (texts) that\n belong to that cluster.\n unique_labels: The unique labels of the clusters.\n\n Returns:\n The input texts to send to the LLM, with the examples of each cluster\n prepared to be used in the prompt, and an additional key to store the\n labels (that will be needed to find the data after the batches are\n returned from the LLM).\n \"\"\"\n input_texts = []\n for label in range(unique_labels): # The label -1 is implicitly excluded\n # Get the ids but remove possible duplicates, which could happen with bigger probability\n # the bigger the number of examples requested, and the smaller the subset of examples\n ids = set(\n np.random.choice(label2docs[label], size=self.samples_per_cluster)\n ) # Grab the number of examples\n examples = [inputs[i][\"text\"] for i in ids]\n input_text = {\n \"text\": \"\\n\\n\".join(\n [f\"Example {i}:\\n{t}\" for i, t in enumerate(examples, start=1)]\n ),\n \"__LABEL\": label,\n }\n input_texts.append(input_text)\n return input_texts\n "},{"location":"api/step_gallery/extra/#distilabel.steps.UMAP","title":"UMAP ","text":" Bases: GlobalStep UMAP is a general purpose manifold learning and dimension reduction algorithm. This is a GlobalStep that reduces the dimensionality of the embeddings using. Visit the TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub. Input columns - embedding (
List[float] ): The original embeddings we want to reduce the dimension. Output columns - projection (
List[float] ): Embedding reduced to the number of components specified, the size of the new embeddings will be determined by the n_components . Categories - clustering
- text-classification
References UMAP repository UMAP documentation Attributes: Name Type Description - n_components The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100. - metric The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean . - n_jobs The number of parallel jobs to run. Defaults to 8 . - random_state The random state to use for the UMAP algorithm. Runtime parameters n_components : The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100. metric : The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean . n_jobs : The number of parallel jobs to run. Defaults to 8 . random_state : The random state to use for the UMAP algorithm. Citations @misc{mcinnes2020umapuniformmanifoldapproximation,\n title={UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction},\n author={Leland McInnes and John Healy and James Melville},\n year={2020},\n eprint={1802.03426},\n archivePrefix={arXiv},\n primaryClass={stat.ML},\n url={https://arxiv.org/abs/1802.03426},\n}\n Source code in src/distilabel/steps/clustering/umap.py class UMAP(GlobalStep):\n r\"\"\"UMAP is a general purpose manifold learning and dimension reduction algorithm.\n\n This is a `GlobalStep` that reduces the dimensionality of the embeddings using. Visit\n the `TextClustering` step for an example of use. The trained model is saved as an artifact\n when creating a distiset and pushing it to the Hugging Face Hub.\n\n Input columns:\n - embedding (`List[float]`): The original embeddings we want to reduce the dimension.\n\n Output columns:\n - projection (`List[float]`): Embedding reduced to the number of components specified,\n the size of the new embeddings will be determined by the `n_components`.\n\n Categories:\n - clustering\n - text-classification\n\n References:\n - [`UMAP repository`](https://github.com/lmcinnes/umap/tree/master)\n - [`UMAP documentation`](https://umap-learn.readthedocs.io/en/latest/)\n\n Attributes:\n - n_components: The dimension of the space to embed into. This defaults to 2 to\n provide easy visualization (that's probably what you want), but can\n reasonably be set to any integer value in the range 2 to 100.\n - metric: The metric to use to compute distances in high dimensional space.\n Visit UMAP's documentation for more information. Defaults to `euclidean`.\n - n_jobs: The number of parallel jobs to run. Defaults to `8`.\n - random_state: The random state to use for the UMAP algorithm.\n\n Runtime parameters:\n - `n_components`: The dimension of the space to embed into. This defaults to 2 to\n provide easy visualization (that's probably what you want), but can\n reasonably be set to any integer value in the range 2 to 100.\n - `metric`: The metric to use to compute distances in high dimensional space.\n Visit UMAP's documentation for more information. Defaults to `euclidean`.\n - `n_jobs`: The number of parallel jobs to run. 
Defaults to `8`.\n - `random_state`: The random state to use for the UMAP algorithm.\n\n Citations:\n ```\n @misc{mcinnes2020umapuniformmanifoldapproximation,\n title={UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction},\n author={Leland McInnes and John Healy and James Melville},\n year={2020},\n eprint={1802.03426},\n archivePrefix={arXiv},\n primaryClass={stat.ML},\n url={https://arxiv.org/abs/1802.03426},\n }\n ```\n \"\"\"\n\n n_components: Optional[RuntimeParameter[int]] = Field(\n default=2,\n description=(\n \"The dimension of the space to embed into. This defaults to 2 to \"\n \"provide easy visualization, but can reasonably be set to any \"\n \"integer value in the range 2 to 100.\"\n ),\n )\n metric: Optional[RuntimeParameter[str]] = Field(\n default=\"euclidean\",\n description=(\n \"The metric to use to compute distances in high dimensional space. \"\n \"Visit UMAP's documentation for more information.\"\n ),\n )\n n_jobs: Optional[RuntimeParameter[int]] = Field(\n default=8, description=\"The number of parallel jobs to run.\"\n )\n random_state: Optional[RuntimeParameter[int]] = Field(\n default=None, description=\"The random state to use for the UMAP algorithm.\"\n )\n\n _umap: Optional[\"_UMAP\"] = PrivateAttr(None)\n\n def load(self) -> None:\n super().load()\n if importlib.util.find_spec(\"umap\") is None:\n raise ImportError(\n \"`umap` package is not installed. Please install it using `pip install umap-learn`.\"\n )\n from umap import UMAP as _UMAP\n\n self._umap = _UMAP(\n n_components=self.n_components,\n metric=self.metric,\n n_jobs=self.n_jobs,\n random_state=self.random_state,\n )\n\n def unload(self) -> None:\n self._umap = None\n\n @property\n def inputs(self) -> List[str]:\n return [\"embedding\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"projection\"]\n\n def _save_model(self, model: Any) -> None:\n import joblib\n\n def save_model(path):\n with open(str(path / \"UMAP.joblib\"), \"wb\") as f:\n joblib.dump(model, f)\n\n self.save_artifact(\n name=\"UMAP_model\",\n write_function=lambda path: save_model(path),\n metadata={\n \"n_components\": self.n_components,\n \"metric\": self.metric,\n },\n )\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n # Shape of the embeddings is (n_samples, n_features)\n embeddings = np.array([input[\"embedding\"] for input in inputs])\n\n self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Start UMAP training...\")\n mapper = self._umap.fit(embeddings)\n # Shape of the projection will be (n_samples, n_components)\n for input, projection in zip(inputs, mapper.embedding_):\n input[\"projection\"] = projection\n\n self._save_model(mapper)\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.CombineOutputs","title":"CombineOutputs ","text":" Bases: Step Combine the outputs of several upstream steps. CombineOutputs is a Step that takes the outputs of several upstream steps and combines them to generate a new dictionary with all keys/columns of the upstream steps outputs. Input columns - dynamic (based on the upstream
Step s): All the columns of the upstream steps outputs. Output columns - dynamic (based on the upstream
Step s): All the columns of the upstream steps outputs. Categories Examples: Combine dictionaries of a dataset:\n\n```python\nfrom distilabel.steps import CombineOutputs\n\ncombine_outputs = CombineOutputs()\ncombine_outputs.load()\n\nresult = next(\n combine_outputs.process(\n [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n )\n)\n# [\n# {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n# {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n# ]\n```\n\nCombine upstream steps outputs in a pipeline:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs\n\nwith Pipeline() as pipeline:\n step_1 = ...\n step_2 = ...\n step_3 = ...\n combine = CombineOutputs()\n\n [step_1, step_2, step_3] >> combine\n```\n Source code in src/distilabel/steps/columns/combine.py class CombineOutputs(Step):\n \"\"\"Combine the outputs of several upstream steps.\n\n `CombineOutputs` is a `Step` that takes the outputs of several upstream steps and combines\n them to generate a new dictionary with all keys/columns of the upstream steps outputs.\n\n Input columns:\n - dynamic (based on the upstream `Step`s): All the columns of the upstream steps outputs.\n\n Output columns:\n - dynamic (based on the upstream `Step`s): All the columns of the upstream steps outputs.\n\n Categories:\n - columns\n\n Examples:\n\n Combine dictionaries of a dataset:\n\n ```python\n from distilabel.steps import CombineOutputs\n\n combine_outputs = CombineOutputs()\n combine_outputs.load()\n\n result = next(\n combine_outputs.process(\n [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n )\n )\n # [\n # {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n # {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n # ]\n ```\n\n Combine upstream steps outputs in a pipeline:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps import CombineOutputs\n\n with Pipeline() as pipeline:\n step_1 = ...\n step_2 = ...\n step_3 = ...\n combine = CombineOutputs()\n\n [step_1, step_2, step_3] >> combine\n ```\n \"\"\"\n\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n combined_outputs = []\n for output_dicts in zip(*inputs):\n combined_dict = {}\n for output_dict in output_dicts:\n combined_dict.update(\n {\n k: v\n for k, v in output_dict.items()\n if k != DISTILABEL_METADATA_KEY\n }\n )\n\n if any(\n DISTILABEL_METADATA_KEY in output_dict for output_dict in output_dicts\n ):\n combined_dict[DISTILABEL_METADATA_KEY] = merge_distilabel_metadata(\n *output_dicts\n )\n combined_outputs.append(combined_dict)\n\n yield combined_outputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering","title":"DeitaFiltering ","text":" Bases: GlobalStep Filter dataset rows using DEITA filtering strategy. Filter the dataset based on the DEITA score and the cosine distance between the embeddings. It's an implementation of the filtering step from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. Attributes: Name Type Description data_budget RuntimeParameter[int] The desired size of the dataset after filtering. diversity_threshold RuntimeParameter[float] If a row has a cosine distance with respect to it's nearest neighbor greater than this value, it will be included in the filtered dataset. Defaults to 0.9 . normalize_embeddings RuntimeParameter[bool] Whether to normalize the embeddings before computing the cosine distance. Defaults to True . 
Runtime parameters data_budget : The desired size of the dataset after filtering. diversity_threshold : If a row has a cosine distance with respect to it's nearest neighbor greater than this value, it will be included in the filtered dataset. Input columns - evol_instruction_score (
float ): The score of the instruction generated by ComplexityScorer step. - evol_response_score (
float ): The score of the response generated by QualityScorer step. - embedding (
List[float] ): The embedding generated for the conversation of the instruction-response pair using GenerateEmbeddings step. Output columns - deita_score (
float ): The DEITA score for the instruction-response pair. - deita_score_computed_with (
List[str] ): The scores used to compute the DEITA score. - nearest_neighbor_distance (
float ): The cosine distance between the embeddings of the instruction-response pair. Categories References What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning Examples: Filter the dataset based on the DEITA score and the cosine distance between the embeddings: from distilabel.steps import DeitaFiltering\n\ndeita_filtering = DeitaFiltering(data_budget=1)\n\ndeita_filtering.load()\n\nresult = next(\n deita_filtering.process(\n [\n {\n \"evol_instruction_score\": 0.5,\n \"evol_response_score\": 0.5,\n \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n },\n {\n \"evol_instruction_score\": 0.6,\n \"evol_response_score\": 0.6,\n \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n },\n {\n \"evol_instruction_score\": 0.7,\n \"evol_response_score\": 0.7,\n \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n },\n ],\n )\n)\n# >>> result\n# [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n Source code in src/distilabel/steps/deita.py class DeitaFiltering(GlobalStep):\n \"\"\"Filter dataset rows using DEITA filtering strategy.\n\n Filter the dataset based on the DEITA score and the cosine distance between the embeddings.\n It's an implementation of the filtering step from the paper 'What Makes Good Data\n for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.\n\n Attributes:\n data_budget: The desired size of the dataset after filtering.\n diversity_threshold: If a row has a cosine distance with respect to it's nearest\n neighbor greater than this value, it will be included in the filtered dataset.\n Defaults to `0.9`.\n normalize_embeddings: Whether to normalize the embeddings before computing the cosine\n distance. Defaults to `True`.\n\n Runtime parameters:\n - `data_budget`: The desired size of the dataset after filtering.\n - `diversity_threshold`: If a row has a cosine distance with respect to it's nearest\n neighbor greater than this value, it will be included in the filtered dataset.\n\n Input columns:\n - evol_instruction_score (`float`): The score of the instruction generated by\n `ComplexityScorer` step.\n - evol_response_score (`float`): The score of the response generated by\n `QualityScorer` step.\n - embedding (`List[float]`): The embedding generated for the conversation of the\n instruction-response pair using `GenerateEmbeddings` step.\n\n Output columns:\n - deita_score (`float`): The DEITA score for the instruction-response pair.\n - deita_score_computed_with (`List[str]`): The scores used to compute the DEITA\n score.\n - nearest_neighbor_distance (`float`): The cosine distance between the embeddings\n of the instruction-response pair.\n\n Categories:\n - filtering\n\n References:\n - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n Examples:\n Filter the dataset based on the DEITA score and the cosine distance between the embeddings:\n\n ```python\n from distilabel.steps import DeitaFiltering\n\n deita_filtering = DeitaFiltering(data_budget=1)\n\n deita_filtering.load()\n\n result = next(\n deita_filtering.process(\n [\n {\n \"evol_instruction_score\": 0.5,\n \"evol_response_score\": 0.5,\n \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n },\n {\n \"evol_instruction_score\": 0.6,\n \"evol_response_score\": 0.6,\n \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n },\n {\n \"evol_instruction_score\": 0.7,\n \"evol_response_score\": 0.7,\n \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n },\n ],\n )\n )\n # >>> result\n # [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n \"\"\"\n\n data_budget: RuntimeParameter[int] = Field(\n default=None, description=\"The desired size of the dataset after filtering.\"\n )\n diversity_threshold: RuntimeParameter[float] = Field(\n default=0.9,\n description=\"If a row has a cosine distance with respect to it's nearest neighbor\"\n \" greater than this value, it will be included in the filtered dataset.\",\n )\n normalize_embeddings: RuntimeParameter[bool] = Field(\n default=True,\n description=\"Whether to normalize the embeddings before computing the cosine distance.\",\n )\n distance_metric: RuntimeParameter[Literal[\"cosine\", \"manhattan\"]] = Field(\n default=\"cosine\",\n description=\"The distance metric to use. Currently only 'cosine' is supported.\",\n )\n\n @property\n def inputs(self) -> List[str]:\n return [\"evol_instruction_score\", \"evol_response_score\", \"embedding\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"deita_score\", \"nearest_neighbor_distance\", \"deita_score_computed_with\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Filter the dataset based on the DEITA score and the cosine distance between the\n embeddings.\n\n Args:\n inputs: The input data.\n\n Returns:\n The filtered dataset.\n \"\"\"\n inputs = self._compute_deita_score(inputs)\n inputs = self._compute_nearest_neighbor(inputs)\n inputs.sort(key=lambda x: x[\"deita_score\"], reverse=True)\n\n selected_rows = []\n for input in inputs:\n if len(selected_rows) >= self.data_budget: # type: ignore\n break\n if input[\"nearest_neighbor_distance\"] >= self.diversity_threshold:\n selected_rows.append(input)\n yield selected_rows\n\n def _compute_deita_score(self, inputs: StepInput) -> StepInput:\n \"\"\"Computes the DEITA score for each instruction-response pair. 
The DEITA score is\n the product of the instruction score and the response score.\n\n Args:\n inputs: The input data.\n\n Returns:\n The input data with the DEITA score computed.\n \"\"\"\n for input_ in inputs:\n evol_instruction_score = input_.get(\"evol_instruction_score\")\n evol_response_score = input_.get(\"evol_response_score\")\n\n if evol_instruction_score and evol_response_score:\n deita_score = evol_instruction_score * evol_response_score\n score_computed_with = [\"evol_instruction_score\", \"evol_response_score\"]\n elif evol_instruction_score:\n self._logger.warning(\n \"Response score is missing for the instruction-response pair. Using\"\n \" instruction score as DEITA score.\"\n )\n deita_score = evol_instruction_score\n score_computed_with = [\"evol_instruction_score\"]\n elif evol_response_score:\n self._logger.warning(\n \"Instruction score is missing for the instruction-response pair. Using\"\n \" response score as DEITA score.\"\n )\n deita_score = evol_response_score\n score_computed_with = [\"evol_response_score\"]\n else:\n self._logger.warning(\n \"Instruction and response scores are missing for the instruction-response\"\n \" pair. Setting DEITA score to 0.\"\n )\n deita_score = 0\n score_computed_with = []\n\n input_.update(\n {\n \"deita_score\": deita_score,\n \"deita_score_computed_with\": score_computed_with,\n }\n )\n return inputs\n\n def _compute_nearest_neighbor(self, inputs: StepInput) -> StepInput:\n \"\"\"Computes the cosine distance between the embeddings of the instruction-response\n pairs and the nearest neighbor.\n\n Args:\n inputs: The input data.\n\n Returns:\n The input data with the cosine distance computed.\n \"\"\"\n embeddings = np.array([input[\"embedding\"] for input in inputs])\n if self.normalize_embeddings:\n embeddings = self._normalize_embeddings(embeddings)\n self._logger.info(\"\ud83d\udccf Computing nearest neighbor distance...\")\n\n if self.distance_metric == \"cosine\":\n self._logger.info(\"\ud83d\udccf Using cosine distance.\")\n distances = self._cosine_distance(embeddings)\n else:\n self._logger.info(\"\ud83d\udccf Using manhattan distance.\")\n distances = self._manhattan_distance(embeddings)\n\n for distance, input in zip(distances, inputs):\n input[\"nearest_neighbor_distance\"] = distance\n return inputs\n\n def _normalize_embeddings(self, embeddings: np.ndarray) -> np.ndarray:\n \"\"\"Normalize the embeddings.\n\n Args:\n embeddings: The embeddings to normalize.\n\n Returns:\n The normalized embeddings.\n \"\"\"\n self._logger.info(\"\u2696\ufe0f Normalizing embeddings...\")\n norms = np.linalg.norm(embeddings, axis=1, keepdims=True)\n return embeddings / norms\n\n def _cosine_distance(self, embeddings: np.array) -> np.array: # type: ignore\n \"\"\"Computes the cosine distance between the embeddings.\n\n Args:\n embeddings: The embeddings.\n\n Returns:\n The cosine distance between the embeddings.\n \"\"\"\n cosine_similarity = np.dot(embeddings, embeddings.T)\n cosine_distance = 1 - cosine_similarity\n # Ignore self-distance\n np.fill_diagonal(cosine_distance, np.inf)\n return np.min(cosine_distance, axis=1)\n\n def _manhattan_distance(self, embeddings: np.array) -> np.array: # type: ignore\n \"\"\"Computes the manhattan distance between the embeddings.\n\n Args:\n embeddings: The embeddings.\n\n Returns:\n The manhattan distance between the embeddings.\n \"\"\"\n manhattan_distance = np.abs(embeddings[:, None] - embeddings).sum(-1)\n # Ignore self-distance\n np.fill_diagonal(manhattan_distance, np.inf)\n return 
np.min(manhattan_distance, axis=1)\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering.process","title":"process(inputs) ","text":"Filter the dataset based on the DEITA score and the cosine distance between the embeddings. Parameters: Name Type Description Default inputs StepInput The input data. required Returns: Type Description StepOutput The filtered dataset. Source code in src/distilabel/steps/deita.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Filter the dataset based on the DEITA score and the cosine distance between the\n embeddings.\n\n Args:\n inputs: The input data.\n\n Returns:\n The filtered dataset.\n \"\"\"\n inputs = self._compute_deita_score(inputs)\n inputs = self._compute_nearest_neighbor(inputs)\n inputs.sort(key=lambda x: x[\"deita_score\"], reverse=True)\n\n selected_rows = []\n for input in inputs:\n if len(selected_rows) >= self.data_budget: # type: ignore\n break\n if input[\"nearest_neighbor_distance\"] >= self.diversity_threshold:\n selected_rows.append(input)\n yield selected_rows\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._compute_deita_score","title":"_compute_deita_score(inputs) ","text":"Computes the DEITA score for each instruction-response pair. The DEITA score is the product of the instruction score and the response score. Parameters: Name Type Description Default inputs StepInput The input data. required Returns: Type Description StepInput The input data with the DEITA score computed. Source code in src/distilabel/steps/deita.py def _compute_deita_score(self, inputs: StepInput) -> StepInput:\n \"\"\"Computes the DEITA score for each instruction-response pair. The DEITA score is\n the product of the instruction score and the response score.\n\n Args:\n inputs: The input data.\n\n Returns:\n The input data with the DEITA score computed.\n \"\"\"\n for input_ in inputs:\n evol_instruction_score = input_.get(\"evol_instruction_score\")\n evol_response_score = input_.get(\"evol_response_score\")\n\n if evol_instruction_score and evol_response_score:\n deita_score = evol_instruction_score * evol_response_score\n score_computed_with = [\"evol_instruction_score\", \"evol_response_score\"]\n elif evol_instruction_score:\n self._logger.warning(\n \"Response score is missing for the instruction-response pair. Using\"\n \" instruction score as DEITA score.\"\n )\n deita_score = evol_instruction_score\n score_computed_with = [\"evol_instruction_score\"]\n elif evol_response_score:\n self._logger.warning(\n \"Instruction score is missing for the instruction-response pair. Using\"\n \" response score as DEITA score.\"\n )\n deita_score = evol_response_score\n score_computed_with = [\"evol_response_score\"]\n else:\n self._logger.warning(\n \"Instruction and response scores are missing for the instruction-response\"\n \" pair. Setting DEITA score to 0.\"\n )\n deita_score = 0\n score_computed_with = []\n\n input_.update(\n {\n \"deita_score\": deita_score,\n \"deita_score_computed_with\": score_computed_with,\n }\n )\n return inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._compute_nearest_neighbor","title":"_compute_nearest_neighbor(inputs) ","text":"Computes the cosine distance between the embeddings of the instruction-response pairs and the nearest neighbor. Parameters: Name Type Description Default inputs StepInput The input data. required Returns: Type Description StepInput The input data with the cosine distance computed. 
Source code in src/distilabel/steps/deita.py def _compute_nearest_neighbor(self, inputs: StepInput) -> StepInput:\n \"\"\"Computes the cosine distance between the embeddings of the instruction-response\n pairs and the nearest neighbor.\n\n Args:\n inputs: The input data.\n\n Returns:\n The input data with the cosine distance computed.\n \"\"\"\n embeddings = np.array([input[\"embedding\"] for input in inputs])\n if self.normalize_embeddings:\n embeddings = self._normalize_embeddings(embeddings)\n self._logger.info(\"\ud83d\udccf Computing nearest neighbor distance...\")\n\n if self.distance_metric == \"cosine\":\n self._logger.info(\"\ud83d\udccf Using cosine distance.\")\n distances = self._cosine_distance(embeddings)\n else:\n self._logger.info(\"\ud83d\udccf Using manhattan distance.\")\n distances = self._manhattan_distance(embeddings)\n\n for distance, input in zip(distances, inputs):\n input[\"nearest_neighbor_distance\"] = distance\n return inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._normalize_embeddings","title":"_normalize_embeddings(embeddings) ","text":"Normalize the embeddings. Parameters: Name Type Description Default embeddings ndarray The embeddings to normalize. required Returns: Type Description ndarray The normalized embeddings. Source code in src/distilabel/steps/deita.py def _normalize_embeddings(self, embeddings: np.ndarray) -> np.ndarray:\n \"\"\"Normalize the embeddings.\n\n Args:\n embeddings: The embeddings to normalize.\n\n Returns:\n The normalized embeddings.\n \"\"\"\n self._logger.info(\"\u2696\ufe0f Normalizing embeddings...\")\n norms = np.linalg.norm(embeddings, axis=1, keepdims=True)\n return embeddings / norms\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._cosine_distance","title":"_cosine_distance(embeddings) ","text":"Computes the cosine distance between the embeddings. Parameters: Name Type Description Default embeddings array The embeddings. required Returns: Type Description array The cosine distance between the embeddings. Source code in src/distilabel/steps/deita.py def _cosine_distance(self, embeddings: np.array) -> np.array: # type: ignore\n \"\"\"Computes the cosine distance between the embeddings.\n\n Args:\n embeddings: The embeddings.\n\n Returns:\n The cosine distance between the embeddings.\n \"\"\"\n cosine_similarity = np.dot(embeddings, embeddings.T)\n cosine_distance = 1 - cosine_similarity\n # Ignore self-distance\n np.fill_diagonal(cosine_distance, np.inf)\n return np.min(cosine_distance, axis=1)\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._manhattan_distance","title":"_manhattan_distance(embeddings) ","text":"Computes the manhattan distance between the embeddings. Parameters: Name Type Description Default embeddings array The embeddings. required Returns: Type Description array The manhattan distance between the embeddings. 
Source code in src/distilabel/steps/deita.py def _manhattan_distance(self, embeddings: np.array) -> np.array: # type: ignore\n \"\"\"Computes the manhattan distance between the embeddings.\n\n Args:\n embeddings: The embeddings.\n\n Returns:\n The manhattan distance between the embeddings.\n \"\"\"\n manhattan_distance = np.abs(embeddings[:, None] - embeddings).sum(-1)\n # Ignore self-distance\n np.fill_diagonal(manhattan_distance, np.inf)\n return np.min(manhattan_distance, axis=1)\n "},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingGeneration","title":"EmbeddingGeneration ","text":" Bases: Step Generate embeddings using an Embeddings model. EmbeddingGeneration is a Step that using an Embeddings model generates sentence embeddings for the provided input texts. Attributes: Name Type Description embeddings Embeddings the Embeddings model used to generate the sentence embeddings. Input columns - text (
str ): The text for which the sentence embedding has to be generated. Output columns - embedding (
List[Union[float, int]] ): the generated sentence embedding. Categories Examples: Generate sentence embeddings with Sentence Transformers: from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.steps import EmbeddingGeneration\n\nembedding_generation = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\",\n )\n)\n\nembedding_generation.load()\n\nresult = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n# [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n Source code in src/distilabel/steps/embeddings/embedding_generation.py class EmbeddingGeneration(Step):\n \"\"\"Generate embeddings using an `Embeddings` model.\n\n `EmbeddingGeneration` is a `Step` that using an `Embeddings` model generates sentence\n embeddings for the provided input texts.\n\n Attributes:\n embeddings: the `Embeddings` model used to generate the sentence embeddings.\n\n Input columns:\n - text (`str`): The text for which the sentence embedding has to be generated.\n\n Output columns:\n - embedding (`List[Union[float, int]]`): the generated sentence embedding.\n\n Categories:\n - embedding\n\n Examples:\n Generate sentence embeddings with Sentence Transformers:\n\n ```python\n from distilabel.models import SentenceTransformerEmbeddings\n from distilabel.steps import EmbeddingGeneration\n\n embedding_generation = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\",\n )\n )\n\n embedding_generation.load()\n\n result = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n # [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n ```\n\n \"\"\"\n\n embeddings: Embeddings\n\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"text\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"embedding\", \"model_name\"]\n\n def load(self) -> None:\n \"\"\"Loads the `Embeddings` model.\"\"\"\n super().load()\n\n self.embeddings.load()\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n embeddings = self.embeddings.encode(inputs=[input[\"text\"] for input in inputs])\n for input, embedding in zip(inputs, embeddings):\n input[\"embedding\"] = embedding\n input[\"model_name\"] = self.embeddings.model_name\n yield inputs\n\n def unload(self) -> None:\n super().unload()\n self.embeddings.unload()\n "},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingGeneration.load","title":"load() ","text":"Loads the Embeddings model. Source code in src/distilabel/steps/embeddings/embedding_generation.py def load(self) -> None:\n \"\"\"Loads the `Embeddings` model.\"\"\"\n super().load()\n\n self.embeddings.load()\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour","title":"FaissNearestNeighbour ","text":" Bases: GlobalStep Create a faiss index to get the nearest neighbours. FaissNearestNeighbour is a GlobalStep that creates a faiss index using the Hugging Face datasets library integration, and then gets the nearest neighbours and the scores or distance of the nearest neighbours for each input row. Attributes: Name Type Description device Optional[RuntimeParameter[Union[int, List[int]]]] the CUDA device ID or a list of IDs to be used. If negative integer, it will use all the available GPUs. Defaults to None . 
string_factory Optional[RuntimeParameter[str]] the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None . metric_type Optional[RuntimeParameter[int]] the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is importing faiss and then passing one of faiss.METRIC_x variables. Defaults to None . k Optional[RuntimeParameter[int]] the number of nearest neighbours to search for each input row. Defaults to 1 . search_batch_size Optional[RuntimeParameter[int]] the number of rows to include in a search batch. The value can be adjusted to maximize the resource usage or to avoid OOM issues. Defaults to 50 . train_size Optional[RuntimeParameter[int]] If the index needs a training step, specifies how many vectors will be used to train the index. Runtime parameters device : the CUDA device ID or a list of IDs to be used. If negative integer, it will use all the available GPUs. Defaults to None . string_factory : the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None . metric_type : the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is importing faiss and then passing one of faiss.METRIC_x variables. Defaults to None . k : the number of nearest neighbours to search for each input row. Defaults to 1 . search_batch_size : the number of rows to include in a search batch. The value can be adjusted to maximize the resource usage or to avoid OOM issues. Defaults to 50 . train_size : If the index needs a training step, specifies how many vectors will be used to train the index. Input columns - embedding (
List[Union[float, int]] ): a sentence embedding. Output columns - nn_indices (
List[int] ): a list containing the indices of the k nearest neighbours in the inputs for the row. - nn_scores (
List[float] ): a list containing the score or distance to each k nearest neighbour in the inputs. Categories References Examples: Generating embeddings and getting the nearest neighbours: from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\nwith Pipeline(name=\"hello\") as pipeline:\n load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n embeddings = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\"\n )\n )\n\n nearest_neighbours = FaissNearestNeighbour()\n\n load_data >> embeddings >> nearest_neighbours\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n load_data.name: {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n },\n use_cache=False,\n )\n Citations @misc{douze2024faisslibrary,\n title={The Faiss library},\n author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazar\u00e9 and Maria Lomeli and Lucas Hosseini and Herv\u00e9 J\u00e9gou},\n year={2024},\n eprint={2401.08281},\n archivePrefix={arXiv},\n primaryClass={cs.LG},\n url={https://arxiv.org/abs/2401.08281},\n}\n Source code in src/distilabel/steps/embeddings/nearest_neighbour.py class FaissNearestNeighbour(GlobalStep):\n \"\"\"Create a `faiss` index to get the nearest neighbours.\n\n `FaissNearestNeighbour` is a `GlobalStep` that creates a `faiss` index using the Hugging\n Face `datasets` library integration, and then gets the nearest neighbours and the scores\n or distance of the nearest neighbours for each input row.\n\n Attributes:\n device: the CUDA device ID or a list of IDs to be used. If negative integer, it\n will use all the available GPUs. Defaults to `None`.\n string_factory: the name of the factory to be used to build the `faiss` index.\n Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\n Defaults to `None`.\n metric_type: the metric to be used to measure the distance between the points. It's\n an integer and the recommend way to pass it is importing `faiss` and then passing\n one of `faiss.METRIC_x` variables. Defaults to `None`.\n k: the number of nearest neighbours to search for each input row. Defaults to `1`.\n search_batch_size: the number of rows to include in a search batch. The value can\n be adjusted to maximize the resources usage or to avoid OOM issues. Defaults\n to `50`.\n train_size: If the index needs a training step, specifies how many vectors will be\n used to train the index.\n\n Runtime parameters:\n - `device`: the CUDA device ID or a list of IDs to be used. If negative integer,\n it will use all the available GPUs. Defaults to `None`.\n - `string_factory`: the name of the factory to be used to build the `faiss` index.\n Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\n Defaults to `None`.\n - `metric_type`: the metric to be used to measure the distance between the points.\n It's an integer and the recommend way to pass it is importing `faiss` and then\n passing one of `faiss.METRIC_x` variables. Defaults to `None`.\n - `k`: the number of nearest neighbours to search for each input row. Defaults to `1`.\n - `search_batch_size`: the number of rows to include in a search batch. 
The value\n can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults\n to `50`.\n - `train_size`: If the index needs a training step, specifies how many vectors will\n be used to train the index.\n\n Input columns:\n - embedding (`List[Union[float, int]]`): a sentence embedding.\n\n Output columns:\n - nn_indices (`List[int]`): a list containing the indices of the `k` nearest neighbours\n in the inputs for the row.\n - nn_scores (`List[float]`): a list containing the score or distance to each `k`\n nearest neighbour in the inputs.\n\n Categories:\n - embedding\n\n References:\n - [`The Faiss library`](https://arxiv.org/abs/2401.08281)\n\n Examples:\n Generating embeddings and getting the nearest neighbours:\n\n ```python\n from distilabel.models import SentenceTransformerEmbeddings\n from distilabel.pipeline import Pipeline\n from distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\n with Pipeline(name=\"hello\") as pipeline:\n load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n embeddings = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\"\n )\n )\n\n nearest_neighbours = FaissNearestNeighbour()\n\n load_data >> embeddings >> nearest_neighbours\n\n if __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n load_data.name: {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n },\n use_cache=False,\n )\n ```\n\n Citations:\n ```\n @misc{douze2024faisslibrary,\n title={The Faiss library},\n author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazar\u00e9 and Maria Lomeli and Lucas Hosseini and Herv\u00e9 J\u00e9gou},\n year={2024},\n eprint={2401.08281},\n archivePrefix={arXiv},\n primaryClass={cs.LG},\n url={https://arxiv.org/abs/2401.08281},\n }\n ```\n \"\"\"\n\n device: Optional[RuntimeParameter[Union[int, List[int]]]] = Field(\n default=None,\n description=\"The CUDA device ID or a list of IDs to be used. If negative integer,\"\n \" it will use all the available GPUs.\",\n )\n string_factory: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The name of the factory to be used to build the `faiss` index.\"\n \"Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\",\n )\n metric_type: Optional[RuntimeParameter[int]] = Field(\n default=None,\n description=\"The metric to be used to measure the distance between the points. It's\"\n \" an integer and the recommend way to pass it is importing `faiss` and thenpassing\"\n \" one of `faiss.METRIC_x` variables.\",\n )\n k: Optional[RuntimeParameter[int]] = Field(\n default=1,\n description=\"The number of nearest neighbours to search for each input row.\",\n )\n search_batch_size: Optional[RuntimeParameter[int]] = Field(\n default=50,\n description=\"The number of rows to include in a search batch. The value can be adjusted\"\n \" to maximize the resources usage or to avoid OOM issues.\",\n )\n train_size: Optional[RuntimeParameter[int]] = Field(\n default=None,\n description=\"If the index needs a training step, specifies how many vectors will be used to train the index.\",\n )\n\n def load(self) -> None:\n super().load()\n\n if importlib.util.find_spec(\"faiss\") is None:\n raise ImportError(\n \"`faiss` package is not installed. 
Please install it using `pip install\"\n \" faiss-cpu` or `pip install faiss-gpu`.\"\n )\n\n @property\n def inputs(self) -> List[str]:\n return [\"embedding\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"nn_indices\", \"nn_scores\"]\n\n def _build_index(self, inputs: List[Dict[str, Any]]) -> Dataset:\n \"\"\"Builds a `faiss` index using `datasets` integration.\n\n Args:\n inputs: a list of dictionaries.\n\n Returns:\n The build `datasets.Dataset` with its `faiss` index.\n \"\"\"\n dataset = Dataset.from_list(inputs)\n if self.train_size is not None and self.string_factory:\n self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Starting Faiss index training...\")\n dataset.add_faiss_index(\n column=\"embedding\",\n device=self.device, # type: ignore\n string_factory=self.string_factory,\n metric_type=self.metric_type,\n train_size=self.train_size,\n )\n return dataset\n\n def _save_index(self, dataset: Dataset) -> None:\n \"\"\"Save the generated Faiss index as an artifact of the step.\n\n Args:\n dataset: the dataset with the `faiss` index built.\n \"\"\"\n self.save_artifact(\n name=\"faiss_index\",\n write_function=lambda path: dataset.save_faiss_index(\n index_name=\"embedding\", file=path / \"index.faiss\"\n ),\n metadata={\n \"num_rows\": len(dataset),\n \"embedding_dim\": len(dataset[0][\"embedding\"]),\n },\n )\n\n def _search(self, dataset: Dataset) -> Dataset:\n \"\"\"Search the top `k` nearest neighbours for each row in the dataset.\n\n Args:\n dataset: the dataset with the `faiss` index built.\n\n Returns:\n The updated dataset containing the top `k` nearest neighbours for each row,\n as well as the score or distance.\n \"\"\"\n\n def add_search_results(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:\n queries = np.array(examples[\"embedding\"])\n results = dataset.search_batch(\n index_name=\"embedding\",\n queries=queries,\n k=self.k + 1, # type: ignore\n )\n examples[\"nn_indices\"] = [indices[1:] for indices in results.total_indices]\n examples[\"nn_scores\"] = [scores[1:] for scores in results.total_scores]\n return examples\n\n return dataset.map(\n add_search_results, batched=True, batch_size=self.search_batch_size\n )\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n dataset = self._build_index(inputs)\n dataset_with_search_results = self._search(dataset)\n self._save_index(dataset)\n yield dataset_with_search_results.to_list()\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._build_index","title":"_build_index(inputs) ","text":"Builds a faiss index using datasets integration. Parameters: Name Type Description Default inputs List[Dict[str, Any]] a list of dictionaries. required Returns: Type Description Dataset The build datasets.Dataset with its faiss index. 
Source code in src/distilabel/steps/embeddings/nearest_neighbour.py def _build_index(self, inputs: List[Dict[str, Any]]) -> Dataset:\n \"\"\"Builds a `faiss` index using `datasets` integration.\n\n Args:\n inputs: a list of dictionaries.\n\n Returns:\n The build `datasets.Dataset` with its `faiss` index.\n \"\"\"\n dataset = Dataset.from_list(inputs)\n if self.train_size is not None and self.string_factory:\n self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Starting Faiss index training...\")\n dataset.add_faiss_index(\n column=\"embedding\",\n device=self.device, # type: ignore\n string_factory=self.string_factory,\n metric_type=self.metric_type,\n train_size=self.train_size,\n )\n return dataset\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._save_index","title":"_save_index(dataset) ","text":"Save the generated Faiss index as an artifact of the step. Parameters: Name Type Description Default dataset Dataset the dataset with the faiss index built. required Source code in src/distilabel/steps/embeddings/nearest_neighbour.py def _save_index(self, dataset: Dataset) -> None:\n \"\"\"Save the generated Faiss index as an artifact of the step.\n\n Args:\n dataset: the dataset with the `faiss` index built.\n \"\"\"\n self.save_artifact(\n name=\"faiss_index\",\n write_function=lambda path: dataset.save_faiss_index(\n index_name=\"embedding\", file=path / \"index.faiss\"\n ),\n metadata={\n \"num_rows\": len(dataset),\n \"embedding_dim\": len(dataset[0][\"embedding\"]),\n },\n )\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._search","title":"_search(dataset) ","text":"Search the top k nearest neighbours for each row in the dataset. Parameters: Name Type Description Default dataset Dataset the dataset with the faiss index built. required Returns: Type Description Dataset The updated dataset containing the top k nearest neighbours for each row, Dataset as well as the score or distance. Source code in src/distilabel/steps/embeddings/nearest_neighbour.py def _search(self, dataset: Dataset) -> Dataset:\n \"\"\"Search the top `k` nearest neighbours for each row in the dataset.\n\n Args:\n dataset: the dataset with the `faiss` index built.\n\n Returns:\n The updated dataset containing the top `k` nearest neighbours for each row,\n as well as the score or distance.\n \"\"\"\n\n def add_search_results(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:\n queries = np.array(examples[\"embedding\"])\n results = dataset.search_batch(\n index_name=\"embedding\",\n queries=queries,\n k=self.k + 1, # type: ignore\n )\n examples[\"nn_indices\"] = [indices[1:] for indices in results.total_indices]\n examples[\"nn_scores\"] = [scores[1:] for scores in results.total_scores]\n return examples\n\n return dataset.map(\n add_search_results, batched=True, batch_size=self.search_batch_size\n )\n "},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingDedup","title":"EmbeddingDedup ","text":" Bases: GlobalStep Deduplicates text using embeddings. EmbeddingDedup is a Step that detects near-duplicates in datasets, using embeddings to compare the similarity between the texts. The typical workflow with this step would include having a dataset with embeddings precomputed, and then (possibly using the FaissNearestNeighbour ) using the nn_indices and nn_scores , determine the texts that are duplicate. Attributes: Name Type Description threshold Optional[RuntimeParameter[float]] the threshold to consider 2 examples as duplicates. 
It's dependent on the type of index that was used to generate the embeddings. For example, if the embeddings were generated using cosine similarity, a threshold of 0.9 would mark all the texts with a cosine similarity above that value as duplicates. Higher values detect fewer duplicates in such an index, but that should be taken into account when building it. Defaults to 0.9 . Runtime Parameters threshold : the threshold to consider 2 examples as duplicates. Input columns - nn_indices (
List[int] ): a list containing the indices of the k nearest neighbours in the inputs for the row. - nn_scores (
List[float] ): a list containing the score or distance to each k nearest neighbour in the inputs. Output columns - keep_row_after_embedding_filtering (
bool ): boolean indicating if the piece text is not a duplicate i.e. this text should be kept. Categories Examples: Deduplicate a list of texts using embedding information:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n data = LoadDataFromDicts(\n data=[\n {\n \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n \"embedding\": [\n 0.018477669046149742,\n -0.03748236608841726,\n 0.001919870620352492,\n 0.024918478063770535,\n 0.02348063521315178,\n 0.0038251285566308375,\n -0.01723884983037716,\n 0.02881971942372201,\n ],\n \"nn_indices\": [0, 1],\n \"nn_scores\": [\n 0.9164746999740601,\n 0.782106876373291,\n ],\n },\n {\n \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n \"embedding\": [\n -0.0023464179614082125,\n -0.07325472251663565,\n -0.06058678419516501,\n -0.02100326928586996,\n -0.013462744792362657,\n 0.027368447064244242,\n -0.003916070100455717,\n 0.01243614518480423,\n ],\n \"nn_indices\": [0, 2],\n \"nn_scores\": [\n 0.7552462220191956,\n 0.7261884808540344,\n ],\n },\n {\n \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on breaking down complex music notation into understandable steps for their students.\",\n \"embedding\": [\n -0.01630817942328242,\n -0.023760151552345232,\n -0.014249650090627883,\n -0.005713686451446624,\n -0.016033059279131567,\n 0.0071440908501058786,\n -0.05691099643425161,\n 0.01597412704817784,\n ],\n \"nn_indices\": [1, 2],\n \"nn_scores\": [\n 0.8107735514640808,\n 0.7172299027442932,\n ],\n },\n ],\n batch_size=batch_size,\n )\n # In general you should do something like this before the deduplication step, to obtain the\n # `nn_indices` and `nn_scores`. In this case the embeddings are already normalized, so there's\n # no need for it.\n # nn = FaissNearestNeighbour(\n # k=30,\n # metric_type=faiss.METRIC_INNER_PRODUCT,\n # search_batch_size=50,\n # train_size=len(dataset), # The number of embeddings to use for training\n # string_factory=\"IVF300_HNSW32,Flat\" # To use an index (optional, maybe required for big datasets)\n # )\n # Read more about the `string_factory` here:\n # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n embedding_dedup = EmbeddingDedup(\n threshold=0.8,\n input_batch_size=batch_size,\n )\n\n data >> embedding_dedup\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n```\n Source code in src/distilabel/steps/filtering/embedding.py class EmbeddingDedup(GlobalStep):\n \"\"\"Deduplicates text using embeddings.\n\n `EmbeddingDedup` is a Step that detects near-duplicates in datasets, using\n embeddings to compare the similarity between the texts. 
The typical workflow with this step\n would include having a dataset with embeddings precomputed, and then (possibly using the\n `FaissNearestNeighbour`) using the `nn_indices` and `nn_scores`, determine the texts that\n are duplicate.\n\n Attributes:\n threshold: the threshold to consider 2 examples as duplicates.\n It's dependent on the type of index that was used to generate the embeddings.\n For example, if the embeddings were generated using cosine similarity, a threshold\n of `0.9` would make all the texts with a cosine similarity above the value\n duplicates. Higher values detect less duplicates in such an index, but that should\n be taken into account when building it. Defaults to `0.9`.\n\n Runtime Parameters:\n - `threshold`: the threshold to consider 2 examples as duplicates.\n\n Input columns:\n - nn_indices (`List[int]`): a list containing the indices of the `k` nearest neighbours\n in the inputs for the row.\n - nn_scores (`List[float]`): a list containing the score or distance to each `k`\n nearest neighbour in the inputs.\n\n Output columns:\n - keep_row_after_embedding_filtering (`bool`): boolean indicating if the piece `text` is\n not a duplicate i.e. this text should be kept.\n\n Categories:\n - filtering\n\n Examples:\n\n Deduplicate a list of texts using embedding information:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps import EmbeddingDedup\n from distilabel.steps import LoadDataFromDicts\n\n with Pipeline() as pipeline:\n data = LoadDataFromDicts(\n data=[\n {\n \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n \"embedding\": [\n 0.018477669046149742,\n -0.03748236608841726,\n 0.001919870620352492,\n 0.024918478063770535,\n 0.02348063521315178,\n 0.0038251285566308375,\n -0.01723884983037716,\n 0.02881971942372201,\n ],\n \"nn_indices\": [0, 1],\n \"nn_scores\": [\n 0.9164746999740601,\n 0.782106876373291,\n ],\n },\n {\n \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n \"embedding\": [\n -0.0023464179614082125,\n -0.07325472251663565,\n -0.06058678419516501,\n -0.02100326928586996,\n -0.013462744792362657,\n 0.027368447064244242,\n -0.003916070100455717,\n 0.01243614518480423,\n ],\n \"nn_indices\": [0, 2],\n \"nn_scores\": [\n 0.7552462220191956,\n 0.7261884808540344,\n ],\n },\n {\n \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on breaking down complex music notation into understandable steps for their students.\",\n \"embedding\": [\n -0.01630817942328242,\n -0.023760151552345232,\n -0.014249650090627883,\n -0.005713686451446624,\n -0.016033059279131567,\n 0.0071440908501058786,\n -0.05691099643425161,\n 0.01597412704817784,\n ],\n \"nn_indices\": [1, 2],\n \"nn_scores\": [\n 0.8107735514640808,\n 0.7172299027442932,\n ],\n },\n ],\n batch_size=batch_size,\n )\n # In general you should do something like this before the deduplication step, to obtain the\n # `nn_indices` and `nn_scores`. 
In this case the embeddings are already normalized, so there's\n # no need for it.\n # nn = FaissNearestNeighbour(\n # k=30,\n # metric_type=faiss.METRIC_INNER_PRODUCT,\n # search_batch_size=50,\n # train_size=len(dataset), # The number of embeddings to use for training\n # string_factory=\"IVF300_HNSW32,Flat\" # To use an index (optional, maybe required for big datasets)\n # )\n # Read more about the `string_factory` here:\n # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n embedding_dedup = EmbeddingDedup(\n threshold=0.8,\n input_batch_size=batch_size,\n )\n\n data >> embedding_dedup\n\n if __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n ```\n \"\"\"\n\n threshold: Optional[RuntimeParameter[float]] = Field(\n default=0.9,\n description=\"The threshold to consider 2 examples as duplicates. It's dependent \"\n \"on the type of index that was used to generate the embeddings. For example, if \"\n \"the embeddings were generated using cosine similarity, a threshold of `0.9` \"\n \"would make all the texts with a cosine similarity above the value duplicates. \"\n \"Higher values detect less duplicates in such an index, but that should be \"\n \"taken into account when building it.\",\n )\n\n @property\n def inputs(self) -> List[str]:\n return [\"nn_scores\", \"nn_indices\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"keep_row_after_embedding_filtering\"]\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n rows_to_remove = set()\n\n for input in track(inputs, description=\"Running Embedding deduplication...\"):\n input[\"keep_row_after_embedding_filtering\"] = True\n indices_scores = np.array(input[\"nn_scores\"]) > self.threshold\n indices = np.array(input[\"nn_indices\"])[indices_scores]\n if len(indices) > 0: # If there are any rows found over the threshold\n rows_to_remove.update(list(indices))\n\n # Remove duplicates and get the list of rows to remove\n for idx in rows_to_remove:\n inputs[idx][\"keep_row_after_embedding_filtering\"] = False\n\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.MinHashDedup","title":"MinHashDedup ","text":" Bases: Step Deduplicates text using MinHash and MinHashLSH . MinHashDedup is a Step that detects near-duplicates in datasets. The idea roughly translates to the following steps: 1. Tokenize the text into words or ngrams. 2. Create a MinHash for each text. 3. Store the MinHashes in a MinHashLSH . 4. Check if the MinHash is already in the LSH , if so, it is a duplicate. Attributes: Name Type Description num_perm int the number of permutations to use. Defaults to 128 . seed int the seed to use for the MinHash. Defaults to 1 . tokenizer Literal['words', 'ngrams'] the tokenizer to use. Available ones are words or ngrams . If words is selected, it tokenizes the text into words using nltk's word tokenizer. ngram estimates the ngrams (together with the size n ). Defaults to words . n Optional[int] the size of the ngrams to use. Only relevant if tokenizer=\"ngrams\" . Defaults to 5 . threshold float the threshold to consider two MinHashes as duplicates. Values closer to 0 detect more duplicates. Defaults to 0.9 . storage Literal['dict', 'disk'] the storage to use for the LSH. Can be dict to store the index in memory, or disk . 
Keep in mind, disk is an experimental feature not defined in datasketch , which is based on DiskCache's Index class. It should work like a dict but backed by disk; depending on the system it can be slower. Defaults to dict . Input columns - text (
str ): the texts to be filtered. Output columns - keep_row_after_minhash_filtering (
bool ): boolean indicating if the piece text is not a duplicate i.e. this text should be kept. Categories References datasketch documentation - Identifying and Filtering Near-Duplicate Documents
- Diskcache's Index
Examples: Deduplicate a list of texts using MinHash and MinHashLSH:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import MinHashDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n ds_size = 1000\n batch_size = 500 # Bigger batch sizes work better for this step\n data = LoadDataFromDicts(\n data=[\n {\"text\": \"This is a test document.\"},\n {\"text\": \"This document is a test.\"},\n {\"text\": \"Test document for duplication.\"},\n {\"text\": \"Document for duplication test.\"},\n {\"text\": \"This is another unique document.\"},\n ]\n * (ds_size // 5),\n batch_size=batch_size,\n )\n minhash_dedup = MinHashDedup(\n tokenizer=\"words\",\n threshold=0.9, # lower values will increase the number of duplicates\n storage=\"dict\", # or \"disk\" for bigger datasets\n )\n\n data >> minhash_dedup\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n```\n Source code in src/distilabel/steps/filtering/minhash.py class MinHashDedup(Step):\n \"\"\"Deduplicates text using `MinHash` and `MinHashLSH`.\n\n `MinHashDedup` is a Step that detects near-duplicates in datasets. The idea roughly translates\n to the following steps:\n 1. Tokenize the text into words or ngrams.\n 2. Create a `MinHash` for each text.\n 3. Store the `MinHashes` in a `MinHashLSH`.\n 4. Check if the `MinHash` is already in the `LSH`, if so, it is a duplicate.\n\n Attributes:\n num_perm: the number of permutations to use. Defaults to `128`.\n seed: the seed to use for the MinHash. Defaults to `1`.\n tokenizer: the tokenizer to use. Available ones are `words` or `ngrams`.\n If `words` is selected, it tokenizes the text into words using nltk's\n word tokenizer. `ngram` estimates the ngrams (together with the size\n `n`). Defaults to `words`.\n n: the size of the ngrams to use. Only relevant if `tokenizer=\"ngrams\"`. Defaults to `5`.\n threshold: the threshold to consider two MinHashes as duplicates.\n Values closer to 0 detect more duplicates. Defaults to `0.9`.\n storage: the storage to use for the LSH. Can be `dict` to store the index\n in memory, or `disk`. Keep in mind, `disk` is an experimental feature\n not defined in `datasketch`, that is based on DiskCache's `Index` class.\n It should work as a `dict`, but backed by disk, but depending on the system\n it can be slower. Defaults to `dict`.\n\n Input columns:\n - text (`str`): the texts to be filtered.\n\n Output columns:\n - keep_row_after_minhash_filtering (`bool`): boolean indicating if the piece `text` is\n not a duplicate i.e. 
this text should be kept.\n\n Categories:\n - filtering\n\n References:\n - [`datasketch documentation`](https://ekzhu.github.io/datasketch/lsh.html)\n - [Identifying and Filtering Near-Duplicate Documents](https://cs.brown.edu/courses/cs253/papers/nearduplicate.pdf)\n - [Diskcache's Index](https://grantjenks.com/docs/diskcache/api.html#diskcache.Index)\n\n Examples:\n\n Deduplicate a list of texts using MinHash and MinHashLSH:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps import MinHashDedup\n from distilabel.steps import LoadDataFromDicts\n\n with Pipeline() as pipeline:\n ds_size = 1000\n batch_size = 500 # Bigger batch sizes work better for this step\n data = LoadDataFromDicts(\n data=[\n {\"text\": \"This is a test document.\"},\n {\"text\": \"This document is a test.\"},\n {\"text\": \"Test document for duplication.\"},\n {\"text\": \"Document for duplication test.\"},\n {\"text\": \"This is another unique document.\"},\n ]\n * (ds_size // 5),\n batch_size=batch_size,\n )\n minhash_dedup = MinHashDedup(\n tokenizer=\"words\",\n threshold=0.9, # lower values will increase the number of duplicates\n storage=\"dict\", # or \"disk\" for bigger datasets\n )\n\n data >> minhash_dedup\n\n if __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n ```\n \"\"\"\n\n num_perm: int = 128\n seed: int = 1\n tokenizer: Literal[\"words\", \"ngrams\"] = \"words\"\n n: Optional[int] = 5\n threshold: float = 0.9\n storage: Literal[\"dict\", \"disk\"] = \"dict\"\n\n _hasher: Union[\"MinHash\", None] = PrivateAttr(None)\n _tokenizer: Union[Callable, None] = PrivateAttr(None)\n _lhs: Union[\"MinHashLSH\", None] = PrivateAttr(None)\n\n def load(self) -> None:\n super().load()\n if not importlib.import_module(\"datasketch\"):\n raise ImportError(\n \"`datasketch` is needed to deduplicate with MinHash, but is not installed. \"\n \"Please install it using `pip install datasketch`.\"\n )\n from datasketch import MinHash\n\n from distilabel.steps.filtering._datasketch import MinHashLSH\n\n self._hasher = MinHash.bulk\n self._lsh = MinHashLSH(\n num_perm=self.num_perm,\n threshold=self.threshold,\n storage_config={\"type\": self.storage},\n )\n\n if self.tokenizer == \"words\":\n if not importlib.import_module(\"nltk\"):\n raise ImportError(\n \"`nltk` is needed to tokenize based on words, but is not installed. \"\n \"Please install it using `pip install nltk`. 
Then run `nltk.download('punkt_tab')`.\"\n )\n self._tokenizer = tokenized_on_words\n else:\n self._tokenizer = partial(tokenize_on_ngrams, n=self.n)\n\n def unload(self) -> None:\n super().unload()\n # In case of LSH being stored in disk, we need to close the file.\n if self.storage == \"disk\":\n self._lsh.close()\n\n @property\n def inputs(self) -> List[str]:\n return [\"text\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"keep_row_after_minhash_filtering\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\":\n tokenized_texts = []\n for input in inputs:\n tokenized_texts.append(self._tokenizer([input[self.inputs[0]]])[0])\n\n minhashes = self._hasher(\n tokenized_texts, num_perm=self.num_perm, seed=self.seed\n )\n\n for input, minhash in zip(inputs, minhashes):\n # Check if the text is already in the LSH index\n if self._lsh.query(minhash):\n input[\"keep_row_after_minhash_filtering\"] = False\n else:\n self._lsh.insert(str(uuid.uuid4()), minhash)\n input[\"keep_row_after_minhash_filtering\"] = True\n\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate","title":"ConversationTemplate ","text":" Bases: Step Generate a conversation template from an instruction and a response. Input columns - instruction (
str ): The instruction to be used in the conversation. - response (
str ): The response to be used in the conversation. Output columns - conversation (
ChatType ): The conversation template. Categories Examples: Create a conversation from an instruction and a response: from distilabel.steps import ConversationTemplate\n\nconv_template = ConversationTemplate()\nconv_template.load()\n\nresult = next(\n conv_template.process(\n [\n {\n \"instruction\": \"Hello\",\n \"response\": \"Hi\",\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n Source code in src/distilabel/steps/formatting/conversation.py class ConversationTemplate(Step):\n \"\"\"Generate a conversation template from an instruction and a response.\n\n Input columns:\n - instruction (`str`): The instruction to be used in the conversation.\n - response (`str`): The response to be used in the conversation.\n\n Output columns:\n - conversation (`ChatType`): The conversation template.\n\n Categories:\n - format\n - chat\n - template\n\n Examples:\n Create a conversation from an instruction and a response:\n\n ```python\n from distilabel.steps import ConversationTemplate\n\n conv_template = ConversationTemplate()\n conv_template.load()\n\n result = next(\n conv_template.process(\n [\n {\n \"instruction\": \"Hello\",\n \"response\": \"Hi\",\n }\n ],\n )\n )\n # >>> result\n # [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The instruction and response.\"\"\"\n return [\"instruction\", \"response\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The conversation template.\"\"\"\n return [\"conversation\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generate a conversation template from an instruction and a response.\n\n Args:\n inputs: The input data.\n\n Yields:\n The input data with the conversation template.\n \"\"\"\n for input in inputs:\n input[\"conversation\"] = [\n {\"role\": \"user\", \"content\": input[\"instruction\"]},\n {\"role\": \"assistant\", \"content\": input[\"response\"]},\n ]\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.inputs","title":"inputs: StepColumns property ","text":"The instruction and response. "},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.outputs","title":"outputs: StepColumns property ","text":"The conversation template. "},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.process","title":"process(inputs) ","text":"Generate a conversation template from an instruction and a response. Parameters: Name Type Description Default inputs StepInput The input data. required Yields: Type Description StepOutput The input data with the conversation template. 
Source code in src/distilabel/steps/formatting/conversation.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generate a conversation template from an instruction and a response.\n\n Args:\n inputs: The input data.\n\n Yields:\n The input data with the conversation template.\n \"\"\"\n for input in inputs:\n input[\"conversation\"] = [\n {\"role\": \"user\", \"content\": input[\"instruction\"]},\n {\"role\": \"assistant\", \"content\": input[\"response\"]},\n ]\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO","title":"FormatChatGenerationDPO ","text":" Bases: Step Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO). FormatChatGenerationDPO is a Step that formats the output of the combination of a ChatGeneration task with a preference Task i.e. a task generating ratings such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook ., so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings . Note The messages column should contain at least one message from the user, the generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations. Input columns - messages (
List[Dict[str, str]] ): The conversation messages. - generations (
List[str] ): The generations produced by the LLM . - generation_models (
List[str] , optional): The model names used to generate the generations , only available if the model_name from the ChatGeneration task/s is combined into a single column named this way; otherwise, it will be ignored. - ratings (
List[float] ): The ratings for each of the generations , produced by a preference task such as UltraFeedback . Output columns - prompt (
str ): The user message used to generate the generations with the LLM . - prompt_id (
str ): The SHA256 hash of the prompt . - chosen (
List[Dict[str, str]] ): The chosen generation based on the ratings . - chosen_model (
str , optional): The model name used to generate the chosen generation, if the generation_models are available. - chosen_rating (
float ): The rating of the chosen generation. - rejected (
List[Dict[str, str]] ): The rejected generation based on the ratings . - rejected_model (
str , optional): The model name used to generate the rejected generation, if the generation_models are available. - rejected_rating (
float ): The rating of the rejected generation. Categories - format
- chat-generation
- preference
- messages
- generations
Examples: Format your dataset for DPO fine tuning: from distilabel.steps import FormatChatGenerationDPO\n\nformat_dpo = FormatChatGenerationDPO()\nformat_dpo.load()\n\n# NOTE: \"generation_models\" can be added optionally.\nresult = next(\n format_dpo.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n# 'generations': ['4', '5', '6'],\n# 'ratings': [1, 0, -1],\n# 'prompt': \"What's 2+2?\",\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'chosen_rating': 1,\n# 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n# 'rejected_rating': -1\n# }\n# ]\n Source code in src/distilabel/steps/formatting/dpo.py class FormatChatGenerationDPO(Step):\n \"\"\"Format the output of a combination of a `ChatGeneration` + a preference task for Direct Preference Optimization (DPO).\n\n `FormatChatGenerationDPO` is a `Step` that formats the output of the combination of a `ChatGeneration`\n task with a preference `Task` i.e. a task generating `ratings` such as `UltraFeedback` following the standard\n formatting from frameworks such as `axolotl` or `alignment-handbook`., so that those are used to rank the\n existing generations and provide the `chosen` and `rejected` generations based on the `ratings`.\n\n Note:\n The `messages` column should contain at least one message from the user, the `generations`\n column should contain at least two generations, the `ratings` column should contain the same\n number of ratings as generations.\n\n Input columns:\n - messages (`List[Dict[str, str]]`): The conversation messages.\n - generations (`List[str]`): The generations produced by the `LLM`.\n - generation_models (`List[str]`, optional): The model names used to generate the `generations`,\n only available if the `model_name` from the `ChatGeneration` task/s is combined into a single\n column named this way, otherwise, it will be ignored.\n - ratings (`List[float]`): The ratings for each of the `generations`, produced by a preference\n task such as `UltraFeedback`.\n\n Output columns:\n - prompt (`str`): The user message used to generate the `generations` with the `LLM`.\n - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n - chosen (`List[Dict[str, str]]`): The `chosen` generation based on the `ratings`.\n - chosen_model (`str`, optional): The model name used to generate the `chosen` generation,\n if the `generation_models` are available.\n - chosen_rating (`float`): The rating of the `chosen` generation.\n - rejected (`List[Dict[str, str]]`): The `rejected` generation based on the `ratings`.\n - rejected_model (`str`, optional): The model name used to generate the `rejected` generation,\n if the `generation_models` are available.\n - rejected_rating (`float`): The rating of the `rejected` generation.\n\n Categories:\n - format\n - chat-generation\n - preference\n - messages\n - generations\n\n Examples:\n Format your dataset for DPO fine tuning:\n\n ```python\n from distilabel.steps import FormatChatGenerationDPO\n\n format_dpo = FormatChatGenerationDPO()\n format_dpo.load()\n\n # NOTE: \"generation_models\" can be added optionally.\n result = next(\n format_dpo.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": 
\"What's 2+2?\"}],\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n )\n # >>> result\n # [\n # {\n # 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n # 'generations': ['4', '5', '6'],\n # 'ratings': [1, 0, -1],\n # 'prompt': \"What's 2+2?\",\n # 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n # 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n # 'chosen_rating': 1,\n # 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n # 'rejected_rating': -1\n # }\n # ]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"List of inputs required by the `Step`, which in this case are: `messages`, `generations`,\n and `ratings`.\"\"\"\n return [\"messages\", \"generations\", \"ratings\"]\n\n @property\n def optional_inputs(self) -> List[str]:\n \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n which in this case is: `generation_models`.\"\"\"\n return [\"generation_models\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `chosen`,\n `chosen_model`, `chosen_rating`, `rejected`, `rejected_model`, `rejected_rating`. Both\n the `chosen_model` and `rejected_model` being optional and only used if `generation_models`\n is available.\n\n Reference:\n - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n \"\"\"\n return [\n \"prompt\",\n \"prompt_id\",\n \"chosen\",\n \"chosen_model\",\n \"chosen_rating\",\n \"rejected\",\n \"rejected_model\",\n \"rejected_rating\",\n ]\n\n def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the DPO formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = next(\n (\n turn[\"content\"]\n for turn in item[\"messages\"]\n if turn[\"role\"] == \"user\"\n ),\n None,\n )\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"chosen\"] = item[\"messages\"] + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][chosen_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"rejected\"] = item[\"messages\"] + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][rejected_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.inputs","title":"inputs: StepColumns property ","text":"List of inputs required by the Step , which in this case are: messages , generations , and ratings . 
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.optional_inputs","title":"optional_inputs: List[str] property ","text":"List of optional inputs, which are not required by the Step but used if available, which in this case is: generation_models . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.outputs","title":"outputs: StepColumns property ","text":"List of outputs generated by the Step , which are: prompt , prompt_id , chosen , chosen_model , chosen_rating , rejected , rejected_model , rejected_rating . Both the chosen_model and rejected_model being optional and only used if generation_models is available. Reference - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.process","title":"process(*inputs) ","text":"The process method formats the received StepInput or list of StepInput according to the DPO formatting standard. Parameters: Name Type Description Default *inputs StepInput A list of StepInput to be combined. () Yields: Type Description StepOutput A StepOutput with batches of formatted StepInput following the DPO standard. Source code in src/distilabel/steps/formatting/dpo.py def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the DPO formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = next(\n (\n turn[\"content\"]\n for turn in item[\"messages\"]\n if turn[\"role\"] == \"user\"\n ),\n None,\n )\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"chosen\"] = item[\"messages\"] + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][chosen_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"rejected\"] = item[\"messages\"] + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][rejected_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO","title":"FormatTextGenerationDPO ","text":" Bases: Step Format the output of your LLMs for Direct Preference Optimization (DPO). FormatTextGenerationDPO is a Step that formats the output of the combination of a TextGeneration task with a preference Task i.e. a task generating ratings , so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings . Use this step to transform the output of a combination of a TextGeneration + a preference task such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook . Note The generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations. Input columns - system_prompt (
str , optional): The system prompt used within the LLM to generate the generations , if available. - instruction (
str ): The instruction used to generate the generations with the LLM . - generations (
List[str] ): The generations produced by the LLM . - generation_models (
List[str] , optional): The model names used to generate the generations , only available if the model_name from the TextGeneration task/s is combined into a single column named this way; otherwise, it will be ignored. - ratings (
List[float] ): The ratings for each of the generations , produced by a preference task such as UltraFeedback . Output columns - prompt (
str ): The instruction used to generate the generations with the LLM . - prompt_id (
str ): The SHA256 hash of the prompt . - chosen (
List[Dict[str, str]] ): The chosen generation based on the ratings . - chosen_model (
str , optional): The model name used to generate the chosen generation, if the generation_models are available. - chosen_rating (
float ): The rating of the chosen generation. - rejected (
List[Dict[str, str]] ): The rejected generation based on the ratings . - rejected_model (
str , optional): The model name used to generate the rejected generation, if the generation_models are available. - rejected_rating (
float ): The rating of the rejected generation. Categories - format
- text-generation
- preference
- instruction
- generations
Examples: Format your dataset for DPO fine tuning: from distilabel.steps import FormatTextGenerationDPO\n\nformat_dpo = FormatTextGenerationDPO()\nformat_dpo.load()\n\n# NOTE: Both \"system_prompt\" and \"generation_models\" can be added optionally.\nresult = next(\n format_dpo.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n)\n# >>> result\n# [\n# { 'instruction': \"What's 2+2?\",\n# 'generations': ['4', '5', '6'],\n# 'ratings': [1, 0, -1],\n# 'prompt': \"What's 2+2?\",\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'chosen_rating': 1,\n# 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n# 'rejected_rating': -1\n# }\n# ]\n Source code in src/distilabel/steps/formatting/dpo.py class FormatTextGenerationDPO(Step):\n \"\"\"Format the output of your LLMs for Direct Preference Optimization (DPO).\n\n `FormatTextGenerationDPO` is a `Step` that formats the output of the combination of a `TextGeneration`\n task with a preference `Task` i.e. a task generating `ratings`, so that those are used to rank the\n existing generations and provide the `chosen` and `rejected` generations based on the `ratings`.\n Use this step to transform the output of a combination of a `TextGeneration` + a preference task such as\n `UltraFeedback` following the standard formatting from frameworks such as `axolotl` or `alignment-handbook`.\n\n Note:\n The `generations` column should contain at least two generations, the `ratings` column should\n contain the same number of ratings as generations.\n\n Input columns:\n - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n `generations`, if available.\n - instruction (`str`): The instruction used to generate the `generations` with the `LLM`.\n - generations (`List[str]`): The generations produced by the `LLM`.\n - generation_models (`List[str]`, optional): The model names used to generate the `generations`,\n only available if the `model_name` from the `TextGeneration` task/s is combined into a single\n column named this way, otherwise, it will be ignored.\n - ratings (`List[float]`): The ratings for each of the `generations`, produced by a preference\n task such as `UltraFeedback`.\n\n Output columns:\n - prompt (`str`): The instruction used to generate the `generations` with the `LLM`.\n - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n - chosen (`List[Dict[str, str]]`): The `chosen` generation based on the `ratings`.\n - chosen_model (`str`, optional): The model name used to generate the `chosen` generation,\n if the `generation_models` are available.\n - chosen_rating (`float`): The rating of the `chosen` generation.\n - rejected (`List[Dict[str, str]]`): The `rejected` generation based on the `ratings`.\n - rejected_model (`str`, optional): The model name used to generate the `rejected` generation,\n if the `generation_models` are available.\n - rejected_rating (`float`): The rating of the `rejected` generation.\n\n Categories:\n - format\n - text-generation\n - preference\n - instruction\n - generations\n\n Examples:\n Format your dataset for DPO fine tuning:\n\n ```python\n from distilabel.steps import FormatTextGenerationDPO\n\n format_dpo = FormatTextGenerationDPO()\n format_dpo.load()\n\n # NOTE: Both \"system_prompt\" and \"generation_models\" can 
be added optionally.\n result = next(\n format_dpo.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n )\n # >>> result\n # [\n # { 'instruction': \"What's 2+2?\",\n # 'generations': ['4', '5', '6'],\n # 'ratings': [1, 0, -1],\n # 'prompt': \"What's 2+2?\",\n # 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n # 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n # 'chosen_rating': 1,\n # 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n # 'rejected_rating': -1\n # }\n # ]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"List of inputs required by the `Step`, which in this case are: `instruction`, `generations`,\n and `ratings`.\"\"\"\n return {\n \"system_prompt\": False,\n \"instruction\": True,\n \"generations\": True,\n \"generation_models\": False,\n \"ratings\": True,\n }\n\n @property\n def optional_inputs(self) -> List[str]:\n \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n which in this case are: `system_prompt`, and `generation_models`.\"\"\"\n return [\"system_prompt\", \"generation_models\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `chosen`,\n `chosen_model`, `chosen_rating`, `rejected`, `rejected_model`, `rejected_rating`. Both\n the `chosen_model` and `rejected_model` being optional and only used if `generation_models`\n is available.\n\n Reference:\n - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n \"\"\"\n return [\n \"prompt\",\n \"prompt_id\",\n \"chosen\",\n \"chosen_model\",\n \"chosen_rating\",\n \"rejected\",\n \"rejected_model\",\n \"rejected_rating\",\n ]\n\n def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the DPO formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n \"\"\"\n for input in inputs:\n for item in input:\n messages = [\n {\"role\": \"user\", \"content\": item[\"instruction\"]}, # type: ignore\n ]\n if (\n \"system_prompt\" in item\n and isinstance(item[\"system_prompt\"], str) # type: ignore\n and len(item[\"system_prompt\"]) > 0 # type: ignore\n ):\n messages.insert(\n 0,\n {\"role\": \"system\", \"content\": item[\"system_prompt\"]}, # type: ignore\n )\n\n item[\"prompt\"] = item[\"instruction\"]\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"chosen\"] = messages + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][chosen_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"rejected\"] = messages + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][rejected_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n item[\"rejected_rating\"] = 
item[\"ratings\"][rejected_idx]\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.inputs","title":"inputs: StepColumns property ","text":"List of inputs required by the Step , which in this case are: instruction , generations , and ratings . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.optional_inputs","title":"optional_inputs: List[str] property ","text":"List of optional inputs, which are not required by the Step but used if available, which in this case are: system_prompt , and generation_models . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.outputs","title":"outputs: StepColumns property ","text":"List of outputs generated by the Step , which are: prompt , prompt_id , chosen , chosen_model , chosen_rating , rejected , rejected_model , rejected_rating . Both the chosen_model and rejected_model being optional and only used if generation_models is available. Reference - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.process","title":"process(*inputs) ","text":"The process method formats the received StepInput or list of StepInput according to the DPO formatting standard. Parameters: Name Type Description Default *inputs StepInput A list of StepInput to be combined. () Yields: Type Description StepOutput A StepOutput with batches of formatted StepInput following the DPO standard. Source code in src/distilabel/steps/formatting/dpo.py def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the DPO formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n \"\"\"\n for input in inputs:\n for item in input:\n messages = [\n {\"role\": \"user\", \"content\": item[\"instruction\"]}, # type: ignore\n ]\n if (\n \"system_prompt\" in item\n and isinstance(item[\"system_prompt\"], str) # type: ignore\n and len(item[\"system_prompt\"]) > 0 # type: ignore\n ):\n messages.insert(\n 0,\n {\"role\": \"system\", \"content\": item[\"system_prompt\"]}, # type: ignore\n )\n\n item[\"prompt\"] = item[\"instruction\"]\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"chosen\"] = messages + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][chosen_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n item[\"rejected\"] = messages + [\n {\n \"role\": \"assistant\",\n \"content\": item[\"generations\"][rejected_idx],\n }\n ]\n if \"generation_models\" in item:\n item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT","title":"FormatChatGenerationSFT ","text":" Bases: Step Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT). FormatChatGenerationSFT is a Step that formats the output of a ChatGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook . The output of the ChatGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation. Input columns - system_prompt (
str , optional): The system prompt used within the LLM to generate the generation , if available. - instruction (
str ): The instruction used to generate the generation with the LLM . - generation (
str ): The generation produced by the LLM . Output columns - prompt (
str ): The instruction used to generate the generation with the LLM . - prompt_id (
str ): The SHA256 hash of the prompt . - messages (
List[Dict[str, str]] ): The chat-like conversation with the instruction as the user message and the generation as the assistant message. Categories - format
- chat-generation
- instruction
- generation
Examples: Format your dataset for SFT: from distilabel.steps import FormatChatGenerationSFT\n\nformat_sft = FormatChatGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n format_sft.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n \"generation\": \"4\"\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'generation': '4',\n# 'prompt': 'What's 2+2?',\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# }\n# ]\n Source code in src/distilabel/steps/formatting/sft.py class FormatChatGenerationSFT(Step):\n \"\"\"Format the output of a `ChatGeneration` task for Supervised Fine-Tuning (SFT).\n\n `FormatChatGenerationSFT` is a `Step` that formats the output of a `ChatGeneration` task for\n Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as `axolotl`\n or `alignment-handbook`. The output of the `ChatGeneration` task is formatted into a chat-like\n conversation with the `instruction` as the user message and the `generation` as the assistant\n message. Optionally, if the `system_prompt` is available, it is included as the first message\n in the conversation.\n\n Input columns:\n - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n `generation`, if available.\n - instruction (`str`): The instruction used to generate the `generation` with the `LLM`.\n - generation (`str`): The generation produced by the `LLM`.\n\n Output columns:\n - prompt (`str`): The instruction used to generate the `generation` with the `LLM`.\n - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n - messages (`List[Dict[str, str]]`): The chat-like conversation with the `instruction` as\n the user message and the `generation` as the assistant message.\n\n Categories:\n - format\n - chat-generation\n - instruction\n - generation\n\n Examples:\n Format your dataset for SFT:\n\n ```python\n from distilabel.steps import FormatChatGenerationSFT\n\n format_sft = FormatChatGenerationSFT()\n format_sft.load()\n\n # NOTE: \"system_prompt\" can be added optionally.\n result = next(\n format_sft.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n \"generation\": \"4\"\n }\n ]\n )\n )\n # >>> result\n # [\n # {\n # 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n # 'generation': '4',\n # 'prompt': 'What's 2+2?',\n # 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n # }\n # ]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"List of inputs required by the `Step`, which in this case are: `instruction`, and `generation`.\"\"\"\n return [\"messages\", \"generation\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `messages`.\n\n Reference:\n - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n \"\"\"\n return [\"prompt\", \"prompt_id\", \"messages\"]\n\n def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the SFT formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the 
SFT standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = next(\n (\n turn[\"content\"]\n for turn in item[\"messages\"]\n if turn[\"role\"] == \"user\"\n ),\n None,\n )\n\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n item[\"messages\"] = item[\"messages\"] + [\n {\"role\": \"assistant\", \"content\": item[\"generation\"]}, # type: ignore\n ]\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.inputs","title":"inputs: StepColumns property ","text":"List of inputs required by the Step , which in this case are: instruction , and generation . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.outputs","title":"outputs: StepColumns property ","text":"List of outputs generated by the Step , which are: prompt , prompt_id , messages . Reference - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
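A minimal sketch, assuming a conversation that already contains a system message, showing that prompt is taken from the first user turn and the generation is appended as the final assistant message: from distilabel.steps import FormatChatGenerationSFT\n\nformat_sft = FormatChatGenerationSFT()\nformat_sft.load()\n\nresult = next(\n    format_sft.process(\n        [\n            {\n                \"messages\": [\n                    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n                    {\"role\": \"user\", \"content\": \"What's 2+2?\"},\n                ],\n                \"generation\": \"4\",\n            }\n        ]\n    )\n)\n# `prompt` is \"What's 2+2?\" (the first user turn), `prompt_id` is its SHA256 hash, and\n# `messages` now ends with {\"role\": \"assistant\", \"content\": \"4\"}.\n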
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.process","title":"process(*inputs) ","text":"The process method formats the received StepInput or list of StepInput according to the SFT formatting standard. Parameters: Name Type Description Default *inputs StepInput A list of StepInput to be combined. () Yields: Type Description StepOutput A StepOutput with batches of formatted StepInput following the SFT standard. Source code in src/distilabel/steps/formatting/sft.py def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the SFT formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = next(\n (\n turn[\"content\"]\n for turn in item[\"messages\"]\n if turn[\"role\"] == \"user\"\n ),\n None,\n )\n\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n item[\"messages\"] = item[\"messages\"] + [\n {\"role\": \"assistant\", \"content\": item[\"generation\"]}, # type: ignore\n ]\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT","title":"FormatTextGenerationSFT ","text":" Bases: Step Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT). FormatTextGenerationSFT is a Step that formats the output of a TextGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook . The output of the TextGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation. Input columns - system_prompt (
str , optional): The system prompt used within the LLM to generate the generation , if available. - instruction (
str ): The instruction used to generate the generation with the LLM . - generation (
str ): The generation produced by the LLM . Output columns - prompt (
str ): The instruction used to generate the generation with the LLM . - prompt_id (
str ): The SHA256 hash of the prompt . - messages (
List[Dict[str, str]] ): The chat-like conversation with the instruction as the user message and the generation as the assistant message. Categories - format
- text-generation
- instruction
- generation
Examples: Format your dataset for SFT fine tuning: from distilabel.steps import FormatTextGenerationSFT\n\nformat_sft = FormatTextGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n format_sft.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generation\": \"4\"\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'instruction': 'What's 2+2?',\n# 'generation': '4',\n# 'prompt': 'What's 2+2?',\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n# }\n# ]\n Source code in src/distilabel/steps/formatting/sft.py class FormatTextGenerationSFT(Step):\n \"\"\"Format the output of a `TextGeneration` task for Supervised Fine-Tuning (SFT).\n\n `FormatTextGenerationSFT` is a `Step` that formats the output of a `TextGeneration` task for\n Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as `axolotl`\n or `alignment-handbook`. The output of the `TextGeneration` task is formatted into a chat-like\n conversation with the `instruction` as the user message and the `generation` as the assistant\n message. Optionally, if the `system_prompt` is available, it is included as the first message\n in the conversation.\n\n Input columns:\n - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n `generation`, if available.\n - instruction (`str`): The instruction used to generate the `generation` with the `LLM`.\n - generation (`str`): The generation produced by the `LLM`.\n\n Output columns:\n - prompt (`str`): The instruction used to generate the `generation` with the `LLM`.\n - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n - messages (`List[Dict[str, str]]`): The chat-like conversation with the `instruction` as\n the user message and the `generation` as the assistant message.\n\n Categories:\n - format\n - text-generation\n - instruction\n - generation\n\n Examples:\n Format your dataset for SFT fine tuning:\n\n ```python\n from distilabel.steps import FormatTextGenerationSFT\n\n format_sft = FormatTextGenerationSFT()\n format_sft.load()\n\n # NOTE: \"system_prompt\" can be added optionally.\n result = next(\n format_sft.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generation\": \"4\"\n }\n ]\n )\n )\n # >>> result\n # [\n # {\n # 'instruction': 'What's 2+2?',\n # 'generation': '4',\n # 'prompt': 'What's 2+2?',\n # 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n # 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n # }\n # ]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"List of inputs required by the `Step`, which in this case are: `instruction`, and `generation`.\"\"\"\n return {\n \"system_prompt\": False,\n \"instruction\": True,\n \"generation\": True,\n }\n\n @property\n def optional_inputs(self) -> List[str]:\n \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n which in this case is: `system_prompt`.\"\"\"\n return [\"system_prompt\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `messages`.\n\n Reference:\n - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n \"\"\"\n return [\"prompt\", \"prompt_id\", \"messages\"]\n\n def process(self, *inputs: StepInput) -> 
\"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the SFT formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = item[\"instruction\"]\n\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n item[\"messages\"] = [\n {\"role\": \"user\", \"content\": item[\"instruction\"]}, # type: ignore\n {\"role\": \"assistant\", \"content\": item[\"generation\"]}, # type: ignore\n ]\n if (\n \"system_prompt\" in item\n and isinstance(item[\"system_prompt\"], str) # type: ignore\n and len(item[\"system_prompt\"]) > 0 # type: ignore\n ):\n item[\"messages\"].insert(\n 0,\n {\"role\": \"system\", \"content\": item[\"system_prompt\"]}, # type: ignore\n )\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.inputs","title":"inputs: StepColumns property ","text":"List of inputs required by the Step , which in this case are: instruction , and generation . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.optional_inputs","title":"optional_inputs: List[str] property ","text":"List of optional inputs, which are not required by the Step but used if available, which in this case is: system_prompt . "},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.outputs","title":"outputs: StepColumns property ","text":"List of outputs generated by the Step , which are: prompt , prompt_id , messages . Reference - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.process","title":"process(*inputs) ","text":"The process method formats the received StepInput or list of StepInput according to the SFT formatting standard. Parameters: Name Type Description Default *inputs StepInput A list of StepInput to be combined. () Yields: Type Description StepOutput A StepOutput with batches of formatted StepInput following the SFT standard. Source code in src/distilabel/steps/formatting/sft.py def process(self, *inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n according to the SFT formatting standard.\n\n Args:\n *inputs: A list of `StepInput` to be combined.\n\n Yields:\n A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n \"\"\"\n for input in inputs:\n for item in input:\n item[\"prompt\"] = item[\"instruction\"]\n\n item[\"prompt_id\"] = hashlib.sha256(\n item[\"prompt\"].encode(\"utf-8\") # type: ignore\n ).hexdigest()\n\n item[\"messages\"] = [\n {\"role\": \"user\", \"content\": item[\"instruction\"]}, # type: ignore\n {\"role\": \"assistant\", \"content\": item[\"generation\"]}, # type: ignore\n ]\n if (\n \"system_prompt\" in item\n and isinstance(item[\"system_prompt\"], str) # type: ignore\n and len(item[\"system_prompt\"]) > 0 # type: ignore\n ):\n item[\"messages\"].insert(\n 0,\n {\"role\": \"system\", \"content\": item[\"system_prompt\"]}, # type: ignore\n )\n\n yield input\n "},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts","title":"LoadDataFromDicts ","text":" Bases: GeneratorStep Loads a dataset from a list of dictionaries. GeneratorStep that loads a dataset from a list of dictionaries and yields it in batches. Attributes: Name Type Description data List[Dict[str, Any]] The list of dictionaries to load the data from. Runtime parameters batch_size : The batch size to use when processing the data. Output columns - dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
Categories Examples: Load data from a list of dictionaries: from distilabel.steps import LoadDataFromDicts\n\nloader = LoadDataFromDicts(\n data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n batch_size=2\n)\nloader.load()\n\nresult = next(loader.process())\n# >>> result\n# ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n Source code in src/distilabel/steps/generators/data.py class LoadDataFromDicts(GeneratorStep):\n \"\"\"Loads a dataset from a list of dictionaries.\n\n `GeneratorStep` that loads a dataset from a list of dictionaries and yields it in\n batches.\n\n Attributes:\n data: The list of dictionaries to load the data from.\n\n Runtime parameters:\n - `batch_size`: The batch size to use when processing the data.\n\n Output columns:\n - dynamic (based on the keys found on the first dictionary of the list): The columns\n of the dataset.\n\n Categories:\n - load\n\n Examples:\n Load data from a list of dictionaries:\n\n ```python\n from distilabel.steps import LoadDataFromDicts\n\n loader = LoadDataFromDicts(\n data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n batch_size=2\n )\n loader.load()\n\n result = next(loader.process())\n # >>> result\n # ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n ```\n \"\"\"\n\n data: List[Dict[str, Any]] = Field(default_factory=list, exclude=True)\n\n @override\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\": # type: ignore\n \"\"\"Yields batches from a list of dictionaries.\n\n Args:\n offset: The offset to start the generation from. Defaults to `0`.\n\n Yields:\n A list of Python dictionaries as read from the inputs (propagated in batches)\n and a flag indicating whether the yield batch is the last one.\n \"\"\"\n if offset:\n self.data = self.data[offset:]\n\n while self.data:\n batch = self.data[: self.batch_size]\n self.data = self.data[self.batch_size :]\n yield (\n batch,\n True if len(self.data) == 0 else False,\n )\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"Returns a list of strings with the names of the columns that the step will generate.\"\"\"\n return list(self.data[0].keys())\n "},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts.outputs","title":"outputs: List[str] property ","text":"Returns a list of strings with the names of the columns that the step will generate. "},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts.process","title":"process(offset=0) ","text":"Yields batches from a list of dictionaries. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0 . 0 Yields: Type Description GeneratorStepOutput A list of Python dictionaries as read from the inputs (propagated in batches) GeneratorStepOutput and a flag indicating whether the yield batch is the last one. Source code in src/distilabel/steps/generators/data.py @override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\": # type: ignore\n \"\"\"Yields batches from a list of dictionaries.\n\n Args:\n offset: The offset to start the generation from. 
Defaults to `0`.\n\n Yields:\n A list of Python dictionaries as read from the inputs (propagated in batches)\n and a flag indicating whether the yield batch is the last one.\n \"\"\"\n if offset:\n self.data = self.data[offset:]\n\n while self.data:\n batch = self.data[: self.batch_size]\n self.data = self.data[self.batch_size :]\n yield (\n batch,\n True if len(self.data) == 0 else False,\n )\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DataSampler","title":"DataSampler ","text":" Bases: GeneratorStep Step to sample from a dataset. GeneratorStep that samples from a dataset and yields it in batches. This step is useful when you have a pipeline that can benefit from using examples in the prompts for example as few-shot learning, that can be changing on each row. For example, you can pass a list of dictionaries with N examples and generate M samples from it (assuming you have another step loading data, this M should have the same size as the data being loaded in that step). The size S argument is the number of samples per row generated, so each example would contain S examples to be used as examples. Attributes: Name Type Description data List[Dict[str, Any]] The list of dictionaries to sample from. size int Number of samples per example. For example in a few-shot learning scenario, the number of few-shot examples that will be generated per example. Defaults to 2. samples int Number of examples that will be generated by the step in total. If used with another loader step, this should be the same as the number of samples in the loader step. Defaults to 100. Output columns - dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
Categories Examples: Sample data from a list of dictionaries: from distilabel.steps import DataSampler\n\nsampler = DataSampler(\n data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n samples=10,\n size=2,\n batch_size=4\n)\nsampler.load()\n\nresult = next(sampler.process())\n# >>> result\n# ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n Pipeline with a loader and a sampler combined in a single stream: from datasets import load_dataset\n\nfrom distilabel.steps import LoadDataFromDicts, DataSampler\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples\nfrom distilabel.pipeline import Pipeline\n\nds = (\n load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n .shuffle(seed=42)\n .select(range(500))\n .to_list()\n)\ndata = [\n {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n },\n {\n \"func_name\": \"permutation_count\",\n \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n },\n {\n \"func_name\": \"getdivision\",\n \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n },\n]\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n loader_seeds = LoadDataFromDicts(data=data)\n sampler = DataSampler(\n data=ds,\n size=2,\n samples=len(data),\n batch_size=8,\n )\n prep_examples = PrepareExamples()\n\n sampler >> prep_examples\n (\n [loader_seeds, prep_examples]\n >> combine_steps\n )\n# Now we have a single stream of data with the loader and the sampler data\n Source code in src/distilabel/steps/generators/data_sampler.py class DataSampler(GeneratorStep):\n \"\"\"Step to sample from a dataset.\n\n `GeneratorStep` that samples from a dataset and yields it in batches.\n This step is useful when you have a pipeline that can benefit from using examples\n in the prompts for example as few-shot learning, that can be changing on each row.\n For example, you can pass a list of dictionaries with N examples and generate M samples\n from it (assuming you have another step loading data, this M should have the same size\n as the data being loaded in that step). The size S argument is the number of samples per\n row generated, so each example would contain S examples to be used as examples.\n\n Attributes:\n data: The list of dictionaries to sample from.\n size: Number of samples per example. For example in a few-shot learning scenario,\n the number of few-shot examples that will be generated per example. Defaults to 2.\n samples: Number of examples that will be generated by the step in total.\n If used with another loader step, this should be the same as the number\n of samples in the loader step. 
Defaults to 100.\n\n Output columns:\n - dynamic (based on the keys found on the first dictionary of the list): The columns\n of the dataset.\n\n Categories:\n - load\n\n Examples:\n Sample data from a list of dictionaries:\n\n ```python\n from distilabel.steps import DataSampler\n\n sampler = DataSampler(\n data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n samples=10,\n size=2,\n batch_size=4\n )\n sampler.load()\n\n result = next(sampler.process())\n # >>> result\n # ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n ```\n\n Pipeline with a loader and a sampler combined in a single stream:\n\n ```python\n from datasets import load_dataset\n\n from distilabel.steps import LoadDataFromDicts, DataSampler\n from distilabel.steps.tasks.apigen.utils import PrepareExamples\n from distilabel.pipeline import Pipeline\n\n ds = (\n load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n .shuffle(seed=42)\n .select(range(500))\n .to_list()\n )\n data = [\n {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n },\n {\n \"func_name\": \"permutation_count\",\n \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n },\n {\n \"func_name\": \"getdivision\",\n \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n },\n ]\n with Pipeline(name=\"APIGenPipeline\") as pipeline:\n loader_seeds = LoadDataFromDicts(data=data)\n sampler = DataSampler(\n data=ds,\n size=2,\n samples=len(data),\n batch_size=8,\n )\n prep_examples = PrepareExamples()\n\n sampler >> prep_examples\n (\n [loader_seeds, prep_examples]\n >> combine_steps\n )\n # Now we have a single stream of data with the loader and the sampler data\n ```\n \"\"\"\n\n data: List[Dict[str, Any]] = Field(default_factory=list, exclude=True)\n size: int = Field(\n default=2,\n description=(\n \"Number of samples per example. For example in a few-shot learning scenario, the number \"\n \"of few-shot examples that will be generated per example.\"\n ),\n )\n samples: int = Field(\n default=100,\n description=(\n \"Number of examples that will be generated by the step in total. \"\n \"If used with another loader step, this should be the same as the number of \"\n \"samples in the loader step.\"\n ),\n )\n\n @override\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\": # type: ignore\n \"\"\"Yields batches from a list of dictionaries.\n\n Args:\n offset: The offset to start the generation from. 
Defaults to `0`.\n\n Yields:\n A list of Python dictionaries as read from the inputs (propagated in batches)\n and a flag indicating whether the yield batch is the last one.\n \"\"\"\n\n total_samples = 0\n\n while total_samples < self.samples:\n batch = []\n bs = min(self.batch_size, self.samples - total_samples)\n for _ in range(self.batch_size):\n choices = random.choices(self.data, k=self.size)\n choices = self._transform_data(choices)\n batch.extend(choices)\n total_samples += bs\n batch = list(islice(batch, bs))\n yield (batch, True if total_samples >= self.samples else False)\n batch = []\n\n @staticmethod\n def _transform_data(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n if not data:\n return []\n\n result = {key: [] for key in data[0].keys()}\n\n for item in data:\n for key, value in item.items():\n result[key].append(value)\n\n return [result]\n\n @property\n def outputs(self) -> List[str]:\n return list(self.data[0].keys())\n "},{"location":"api/step_gallery/extra/#distilabel.steps.DataSampler.process","title":"process(offset=0) ","text":"Yields batches from a list of dictionaries. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0 . 0 Yields: Type Description GeneratorStepOutput A list of Python dictionaries as read from the inputs (propagated in batches) GeneratorStepOutput and a flag indicating whether the yield batch is the last one. Source code in src/distilabel/steps/generators/data_sampler.py @override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\": # type: ignore\n \"\"\"Yields batches from a list of dictionaries.\n\n Args:\n offset: The offset to start the generation from. Defaults to `0`.\n\n Yields:\n A list of Python dictionaries as read from the inputs (propagated in batches)\n and a flag indicating whether the yield batch is the last one.\n \"\"\"\n\n total_samples = 0\n\n while total_samples < self.samples:\n batch = []\n bs = min(self.batch_size, self.samples - total_samples)\n for _ in range(self.batch_size):\n choices = random.choices(self.data, k=self.size)\n choices = self._transform_data(choices)\n batch.extend(choices)\n total_samples += bs\n batch = list(islice(batch, bs))\n yield (batch, True if total_samples >= self.samples else False)\n batch = []\n "},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore","title":"RewardModelScore ","text":" Bases: Step , CudaDevicePlacementMixin Assign a score to a response using a Reward Model. RewardModelScore is a Step that using a Reward Model (RM) loaded using transformers , assigns an score to a response generated for an instruction, or a score to a multi-turn conversation. Attributes: Name Type Description model str the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. revision str if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . torch_dtype str the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\" . trust_remote_code bool whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . device_map Union[str, Dict[str, Any], None] a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\" . Defaults to None . token Union[SecretStr, None] the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. 
If not provided, the HF_TOKEN environment variable or huggingface_hub package local configuration will be used. Defaults to None . truncation bool whether to truncate sequences at the maximum length. Defaults to False . max_length Union[int, None] maximum length to use for padding or truncation. Defaults to None . Input columns - instruction (
str , optional): the instruction used to generate a response . If provided, then response must be provided too. - response (
str , optional): the response generated for instruction . If provided, then instruction must be provided too. - conversation (
ChatType , optional): a multi-turn conversation. If not provided, then instruction and response columns must be provided. Output columns - score (
float ): the score given by the reward model for the instruction-response pair or the conversation. Categories Examples: Assigning an score for an instruction-response pair: from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n step.process(\n inputs=[\n {\n \"instruction\": \"How much is 2+2?\",\n \"response\": \"The output of 2+2 is 4\",\n },\n {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n ]\n )\n)\n# [\n# {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n# {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n# ]\n Assigning an score for a multi-turn conversation: from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n step.process(\n inputs=[\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n ],\n },\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"4\"},\n ],\n },\n ]\n )\n)\n# [\n# {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n# {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n# ]\n Source code in src/distilabel/steps/reward_model.py class RewardModelScore(Step, CudaDevicePlacementMixin):\n \"\"\"Assign a score to a response using a Reward Model.\n\n `RewardModelScore` is a `Step` that using a Reward Model (RM) loaded using `transformers`,\n assigns an score to a response generated for an instruction, or a score to a multi-turn\n conversation.\n\n Attributes:\n model: the model Hugging Face Hub repo id or a path to a directory containing the\n model weights and configuration files.\n revision: if `model` refers to a Hugging Face Hub repository, then the revision\n (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc.\n Defaults to `\"auto\"`.\n trust_remote_code: whether to allow fetching and executing remote code fetched\n from the repository in the Hub. Defaults to `False`.\n device_map: a dictionary mapping each layer of the model to a device, or a mode like `\"sequential\"` or `\"auto\"`. Defaults to `None`.\n token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n local configuration will be used. Defaults to `None`.\n truncation: whether to truncate sequences at the maximum length. Defaults to `False`.\n max_length: maximun length to use for padding or truncation. Defaults to `None`.\n\n Input columns:\n - instruction (`str`, optional): the instruction used to generate a `response`.\n If provided, then `response` must be provided too.\n - response (`str`, optional): the response generated for `instruction`. If provided,\n then `instruction` must be provide too.\n - conversation (`ChatType`, optional): a multi-turn conversation. 
If not provided,\n then `instruction` and `response` columns must be provided.\n\n Output columns:\n - score (`float`): the score given by the reward model for the instruction-response\n pair or the conversation.\n\n Categories:\n - scorer\n\n Examples:\n Assigning an score for an instruction-response pair:\n\n ```python\n from distilabel.steps import RewardModelScore\n\n step = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n )\n\n step.load()\n\n result = next(\n step.process(\n inputs=[\n {\n \"instruction\": \"How much is 2+2?\",\n \"response\": \"The output of 2+2 is 4\",\n },\n {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n ]\n )\n )\n # [\n # {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n # {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n # ]\n ```\n\n Assigning an score for a multi-turn conversation:\n\n ```python\n from distilabel.steps import RewardModelScore\n\n step = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n )\n\n step.load()\n\n result = next(\n step.process(\n inputs=[\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n ],\n },\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"4\"},\n ],\n },\n ]\n )\n )\n # [\n # {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n # {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n # ]\n ```\n \"\"\"\n\n model: str\n revision: str = \"main\"\n torch_dtype: str = \"auto\"\n trust_remote_code: bool = False\n device_map: Union[str, Dict[str, Any], None] = None\n token: Union[SecretStr, None] = Field(\n default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR), description=\"\"\n )\n truncation: bool = False\n max_length: Union[int, None] = None\n\n _model: Union[\"PreTrainedModel\", None] = PrivateAttr(None)\n _tokenizer: Union[\"PreTrainedTokenizer\", None] = PrivateAttr(None)\n\n def load(self) -> None:\n super().load()\n\n if self.device_map in [\"cuda\", \"auto\"]:\n CudaDevicePlacementMixin.load(self)\n\n try:\n from transformers import AutoModelForSequenceClassification, AutoTokenizer\n except ImportError as e:\n raise ImportError(\n \"`transformers` is not installed. 
Please install it using `pip install transformers`.\"\n ) from e\n\n token = self.token.get_secret_value() if self.token is not None else self.token\n\n self._model = AutoModelForSequenceClassification.from_pretrained(\n self.model,\n revision=self.revision,\n torch_dtype=self.torch_dtype,\n trust_remote_code=self.trust_remote_code,\n device_map=self.device_map,\n token=token,\n )\n self._tokenizer = AutoTokenizer.from_pretrained(\n self.model,\n revision=self.revision,\n torch_dtype=self.torch_dtype,\n trust_remote_code=self.trust_remote_code,\n token=token,\n )\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"Either `response` and `instruction`, or a `conversation` columns.\"\"\"\n return {\n \"response\": False,\n \"instruction\": False,\n \"conversation\": False,\n }\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The `score` given by the reward model.\"\"\"\n return [\"score\"]\n\n def _prepare_conversation(self, input: Dict[str, Any]) -> \"ChatType\":\n if \"instruction\" in input and \"response\" in input:\n return [\n {\"role\": \"user\", \"content\": input[\"instruction\"]},\n {\"role\": \"assistant\", \"content\": input[\"response\"]},\n ]\n\n return input[\"conversation\"]\n\n def _prepare_inputs(self, inputs: List[Dict[str, Any]]) -> \"torch.Tensor\":\n return self._tokenizer.apply_chat_template( # type: ignore\n [self._prepare_conversation(input) for input in inputs], # type: ignore\n return_tensors=\"pt\",\n padding=True,\n truncation=self.truncation,\n max_length=self.max_length,\n ).to(self._model.device) # type: ignore\n\n def _inference(self, inputs: List[Dict[str, Any]]) -> List[float]:\n import torch\n\n input_ids = self._prepare_inputs(inputs)\n with torch.no_grad():\n output = self._model(input_ids) # type: ignore\n logits = output.logits\n if logits.shape == (2, 1):\n logits = logits.squeeze(-1)\n return logits.tolist()\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n scores = self._inference(inputs)\n for input, score in zip(inputs, scores):\n input[\"score\"] = score\n yield inputs\n\n def unload(self) -> None:\n if self.device_map in [\"cuda\", \"auto\"]:\n CudaDevicePlacementMixin.unload(self)\n super().unload()\n "},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore.inputs","title":"inputs: StepColumns property ","text":"Either response and instruction , or a conversation columns. "},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore.outputs","title":"outputs: StepColumns property ","text":"The score given by the reward model. "},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn","title":"TruncateTextColumn ","text":" Bases: Step Truncate a row using a tokenizer or the number of characters. TruncateTextColumn is a Step that truncates a row according to the max length. If the tokenizer is provided, then the row will be truncated using the tokenizer, and the max_length will be used as the maximum number of tokens, otherwise it will be used as the maximum number of characters. The TruncateTextColumn step is useful when one wants to truncate a row to a certain length, to avoid posterior errors in the model due to the length. Attributes: Name Type Description column str the column to truncate. Defaults to \"text\" . max_length int the maximum length to use for truncation. If a tokenizer is given, corresponds to the number of tokens, otherwise corresponds to the number of characters. Defaults to 8192 . 
tokenizer Optional[str] the name of the tokenizer to use. If provided, the row will be truncated using the tokenizer. Defaults to None . Input columns - dynamic (determined by
column attribute): The columns to be truncated, defaults to \"text\". Output columns - dynamic (determined by
column attribute): The truncated column. Categories Examples: Truncating a row to a given number of tokens: from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n max_length=4,\n column=\"text\"\n)\n\ntrunc.load()\n\nresult = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n)\n# result\n# [{'text': 'This is a sample'}]\n Truncating a row to a given number of characters: from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(max_length=10)\n\ntrunc.load()\n\nresult = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n)\n# result\n# [{'text': 'This is a '}]\n Source code in src/distilabel/steps/truncate.py class TruncateTextColumn(Step):\n \"\"\"Truncate a row using a tokenizer or the number of characters.\n\n `TruncateTextColumn` is a `Step` that truncates a row according to the max length. If\n the `tokenizer` is provided, then the row will be truncated using the tokenizer,\n and the `max_length` will be used as the maximum number of tokens, otherwise it will\n be used as the maximum number of characters. The `TruncateTextColumn` step is useful when one\n wants to truncate a row to a certain length, to avoid posterior errors in the model due\n to the length.\n\n Attributes:\n column: the column to truncate. Defaults to `\"text\"`.\n max_length: the maximum length to use for truncation.\n If a `tokenizer` is given, corresponds to the number of tokens,\n otherwise corresponds to the number of characters. Defaults to `8192`.\n tokenizer: the name of the tokenizer to use. If provided, the row will be\n truncated using the tokenizer. Defaults to `None`.\n\n Input columns:\n - dynamic (determined by `column` attribute): The columns to be truncated, defaults to \"text\".\n\n Output columns:\n - dynamic (determined by `column` attribute): The truncated column.\n\n Categories:\n - text-manipulation\n\n Examples:\n Truncating a row to a given number of tokens:\n\n ```python\n from distilabel.steps import TruncateTextColumn\n\n trunc = TruncateTextColumn(\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n max_length=4,\n column=\"text\"\n )\n\n trunc.load()\n\n result = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n )\n # result\n # [{'text': 'This is a sample'}]\n ```\n\n Truncating a row to a given number of characters:\n\n ```python\n from distilabel.steps import TruncateTextColumn\n\n trunc = TruncateTextColumn(max_length=10)\n\n trunc.load()\n\n result = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n )\n # result\n # [{'text': 'This is a '}]\n ```\n \"\"\"\n\n column: str = \"text\"\n max_length: int = 8192\n tokenizer: Optional[str] = None\n _truncator: Optional[Callable[[str], str]] = None\n _tokenizer: Optional[Any] = None\n\n def load(self):\n super().load()\n if self.tokenizer:\n if not importlib.util.find_spec(\"transformers\"):\n raise ImportError(\n \"`transformers` is needed to tokenize, but is not installed. 
\"\n \"Please install it using `pip install transformers`.\"\n )\n\n from transformers import AutoTokenizer\n\n self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)\n self._truncator = self._truncate_with_tokenizer\n else:\n self._truncator = self._truncate_with_length\n\n @property\n def inputs(self) -> List[str]:\n return [self.column]\n\n @property\n def outputs(self) -> List[str]:\n return self.inputs\n\n def _truncate_with_length(self, text: str) -> str:\n \"\"\"Truncates the text according to the number of characters.\"\"\"\n return text[: self.max_length]\n\n def _truncate_with_tokenizer(self, text: str) -> str:\n \"\"\"Truncates the text according to the number of characters using the tokenizer.\"\"\"\n return self._tokenizer.decode(\n self._tokenizer.encode(\n text,\n add_special_tokens=False,\n max_length=self.max_length,\n truncation=True,\n )\n )\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n input[self.column] = self._truncator(input[self.column])\n yield inputs\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn._truncate_with_length","title":"_truncate_with_length(text) ","text":"Truncates the text according to the number of characters. Source code in src/distilabel/steps/truncate.py def _truncate_with_length(self, text: str) -> str:\n \"\"\"Truncates the text according to the number of characters.\"\"\"\n return text[: self.max_length]\n "},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn._truncate_with_tokenizer","title":"_truncate_with_tokenizer(text) ","text":"Truncates the text according to the number of characters using the tokenizer. Source code in src/distilabel/steps/truncate.py def _truncate_with_tokenizer(self, text: str) -> str:\n \"\"\"Truncates the text according to the number of characters using the tokenizer.\"\"\"\n return self._tokenizer.decode(\n self._tokenizer.encode(\n text,\n add_special_tokens=False,\n max_length=self.max_length,\n truncation=True,\n )\n )\n "},{"location":"api/step_gallery/hugging_face/","title":"Hugging Face","text":"This section contains the existing steps integrated with Hugging Face so as to easily push the generated datasets to Hugging Face. "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk","title":"LoadDataFromDisk ","text":" Bases: LoadDataFromHub Load a dataset that was previously saved to disk. If you previously saved your dataset using the save_to_disk method, or Distiset.save_to_disk you can load it again to build a new pipeline using this class. Attributes: Name Type Description dataset_path RuntimeParameter[Union[str, Path]] The path to the dataset or distiset. split Optional[RuntimeParameter[str]] The split of the dataset to load (typically will be train , test or validation ). config Optional[RuntimeParameter[str]] The configuration of the dataset to load. Defaults to default , if there are multiple configurations in the dataset this must be suplied or an error is raised. Runtime parameters batch_size : The batch size to use when processing the data. dataset_path : The path to the dataset or distiset. is_distiset : Whether the dataset to load is a Distiset or not. Defaults to False. split : The split of the dataset to load. Defaults to 'train'. config : The configuration of the dataset to load. Defaults to default , if there are multiple configurations in the dataset this must be suplied or an error is raised. num_examples : The number of examples to load from the dataset. 
By default, all examples are loaded. storage_options : Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . Output columns - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. Categories Examples: Load data from a Hugging Face Dataset: from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Load data from a distilabel Distiset: from distilabel.steps import LoadDataFromDisk\n\n# Specify the configuration to load.\nloader = LoadDataFromDisk(\n dataset_path=\"path/to/dataset\",\n is_distiset=True,\n config=\"leaf_step_1\"\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n Load data from a Hugging Face Dataset or Distiset in your cloud provider: from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(\n dataset_path=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Source code in src/distilabel/steps/generators/huggingface.py class LoadDataFromDisk(LoadDataFromHub):\n \"\"\"Load a dataset that was previously saved to disk.\n\n If you previously saved your dataset using the `save_to_disk` method, or\n `Distiset.save_to_disk` you can load it again to build a new pipeline using this class.\n\n Attributes:\n dataset_path: The path to the dataset or distiset.\n split: The split of the dataset to load (typically will be `train`, `test` or `validation`).\n config: The configuration of the dataset to load. Defaults to `default`, if there are\n multiple configurations in the dataset this must be suplied or an error is raised.\n\n Runtime parameters:\n - `batch_size`: The batch size to use when processing the data.\n - `dataset_path`: The path to the dataset or distiset.\n - `is_distiset`: Whether the dataset to load is a `Distiset` or not. Defaults to False.\n - `split`: The split of the dataset to load. Defaults to 'train'.\n - `config`: The configuration of the dataset to load. 
Defaults to `default`, if there are\n multiple configurations in the dataset this must be suplied or an error is raised.\n - `num_examples`: The number of examples to load from the dataset.\n By default will load all examples.\n - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n\n Output columns:\n - dynamic (`all`): The columns that will be generated by this step, based on the\n datasets loaded from the Hugging Face Hub.\n\n Categories:\n - load\n\n Examples:\n Load data from a Hugging Face Dataset:\n\n ```python\n from distilabel.steps import LoadDataFromDisk\n\n loader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n\n Load data from a distilabel Distiset:\n\n ```python\n from distilabel.steps import LoadDataFromDisk\n\n # Specify the configuration to load.\n loader = LoadDataFromDisk(\n dataset_path=\"path/to/dataset\",\n is_distiset=True,\n config=\"leaf_step_1\"\n )\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n ```\n\n Load data from a Hugging Face Dataset or Distiset in your cloud provider:\n\n ```python\n from distilabel.steps import LoadDataFromDisk\n\n loader = LoadDataFromDisk(\n dataset_path=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n )\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n \"\"\"\n\n dataset_path: RuntimeParameter[Union[str, Path]] = Field(\n default=None,\n description=\"Path to the dataset or distiset.\",\n )\n config: Optional[RuntimeParameter[str]] = Field(\n default=\"default\",\n description=(\n \"The configuration of the dataset to load. Will default to 'default'\",\n \" which corresponds to a distiset with a single configuration.\",\n ),\n )\n is_distiset: Optional[RuntimeParameter[bool]] = Field(\n default=False,\n description=\"Whether the dataset to load is a `Distiset` or not. Defaults to False.\",\n )\n keep_in_memory: Optional[RuntimeParameter[bool]] = Field(\n default=None,\n description=\"Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk` \"\n \" for more information. Defaults to `None`.\",\n )\n split: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The split of the dataset to load. By default will load the whole Dataset/Distiset.\",\n )\n repo_id: ExcludedField[Union[str, None]] = None\n\n def load(self) -> None:\n \"\"\"Load the dataset from the file/s in disk.\"\"\"\n super(GeneratorStep, self).load()\n if self.is_distiset:\n ds = Distiset.load_from_disk(\n self.dataset_path,\n keep_in_memory=self.keep_in_memory,\n storage_options=self.storage_options,\n )\n if self.config not in ds.keys():\n raise DistilabelUserError(\n f\"Configuration '{self.config}' not found in the Distiset, available ones\"\n f\" are: {list(ds.keys())}. 
Please try changing the `config` parameter to one \"\n \"of the available configurations.\\n\\n\",\n page=\"sections/how_to_guides/advanced/distiset/#using-the-distiset-dataset-object\",\n )\n ds = ds[self.config]\n\n else:\n ds = load_from_disk(\n self.dataset_path,\n keep_in_memory=self.keep_in_memory,\n storage_options=self.storage_options,\n )\n\n if self.split:\n ds = ds[self.split]\n\n self._dataset = ds\n\n if self.num_examples:\n self._dataset = self._dataset.select(range(self.num_examples))\n else:\n self.num_examples = len(self._dataset)\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The columns that will be generated by this step, based on the datasets from a file\n in disk.\n\n Returns:\n The columns that will be generated by this step.\n \"\"\"\n # We assume there are Dataset/IterableDataset, not it's ...Dict counterparts\n if self._dataset is None:\n self.load()\n\n return self._dataset.column_names\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk.outputs","title":"outputs: List[str] property ","text":"The columns that will be generated by this step, based on the datasets from a file in disk. Returns: Type Description List[str] The columns that will be generated by this step. "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk.load","title":"load() ","text":"Load the dataset from the file/s in disk. Source code in src/distilabel/steps/generators/huggingface.py def load(self) -> None:\n \"\"\"Load the dataset from the file/s in disk.\"\"\"\n super(GeneratorStep, self).load()\n if self.is_distiset:\n ds = Distiset.load_from_disk(\n self.dataset_path,\n keep_in_memory=self.keep_in_memory,\n storage_options=self.storage_options,\n )\n if self.config not in ds.keys():\n raise DistilabelUserError(\n f\"Configuration '{self.config}' not found in the Distiset, available ones\"\n f\" are: {list(ds.keys())}. Please try changing the `config` parameter to one \"\n \"of the available configurations.\\n\\n\",\n page=\"sections/how_to_guides/advanced/distiset/#using-the-distiset-dataset-object\",\n )\n ds = ds[self.config]\n\n else:\n ds = load_from_disk(\n self.dataset_path,\n keep_in_memory=self.keep_in_memory,\n storage_options=self.storage_options,\n )\n\n if self.split:\n ds = ds[self.split]\n\n self._dataset = ds\n\n if self.num_examples:\n self._dataset = self._dataset.select(range(self.num_examples))\n else:\n self.num_examples = len(self._dataset)\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem","title":"LoadDataFromFileSystem ","text":" Bases: LoadDataFromHub Loads a dataset from a file in your filesystem. GeneratorStep that creates a dataset from a file in the filesystem, uses Hugging Face datasets library. Take a look at Hugging Face Datasets for more information of the supported file types. Attributes: Name Type Description data_files RuntimeParameter[Union[str, Path]] The path to the file, or directory containing the files that conform the dataset. split RuntimeParameter[Union[str, Path]] The split of the dataset to load (typically will be train , test or validation ). Runtime parameters batch_size : The batch size to use when processing the data. data_files : The path to the file, or directory containing the files that conform the dataset. split : The split of the dataset to load. Defaults to 'train'. streaming : Whether to load the dataset in streaming mode or not. Defaults to False . num_examples : The number of examples to load from the dataset. 
By default, all examples are loaded. storage_options : Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . filetype : The expected filetype. If not provided, it will be inferred from the file extension. For more than one file, it will be inferred from the first file. Output columns - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. Categories Examples: Load data from a Hugging Face dataset in your file system: from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Specify a filetype if the file extension is not expected: from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Load data from a file in your cloud provider: from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n data_files=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Load data passing a glob pattern: from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n data_files=\"path/to/dataset/*.jsonl\",\n streaming=True\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n Source code in src/distilabel/steps/generators/huggingface.py class LoadDataFromFileSystem(LoadDataFromHub):\n \"\"\"Loads a dataset from a file in your filesystem.\n\n `GeneratorStep` that creates a dataset from a file in the filesystem, uses Hugging Face `datasets`\n library. Take a look at [Hugging Face Datasets](https://huggingface.co/docs/datasets/loading)\n for more information of the supported file types.\n\n Attributes:\n data_files: The path to the file, or directory containing the files that conform\n the dataset.\n split: The split of the dataset to load (typically will be `train`, `test` or `validation`).\n\n Runtime parameters:\n - `batch_size`: The batch size to use when processing the data.\n - `data_files`: The path to the file, or directory containing the files that conform\n the dataset.\n - `split`: The split of the dataset to load. Defaults to 'train'.\n - `streaming`: Whether to load the dataset in streaming mode or not. Defaults to\n `False`.\n - `num_examples`: The number of examples to load from the dataset.\n By default will load all examples.\n - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n - `filetype`: The expected filetype. 
If not provided, it will be inferred from the file extension.\n For more than one file, it will be inferred from the first file.\n\n Output columns:\n - dynamic (`all`): The columns that will be generated by this step, based on the\n datasets loaded from the Hugging Face Hub.\n\n Categories:\n - load\n\n Examples:\n Load data from a Hugging Face dataset in your file system:\n\n ```python\n from distilabel.steps import LoadDataFromFileSystem\n\n loader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n\n Specify a filetype if the file extension is not expected:\n\n ```python\n from distilabel.steps import LoadDataFromFileSystem\n\n loader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n\n Load data from a file in your cloud provider:\n\n ```python\n from distilabel.steps import LoadDataFromFileSystem\n\n loader = LoadDataFromFileSystem(\n data_files=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n )\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n\n Load data passing a glob pattern:\n\n ```python\n from distilabel.steps import LoadDataFromFileSystem\n\n loader = LoadDataFromFileSystem(\n data_files=\"path/to/dataset/*.jsonl\",\n streaming=True\n )\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'type': 'function', 'function':...', False)\n ```\n \"\"\"\n\n data_files: RuntimeParameter[Union[str, Path]] = Field(\n default=None,\n description=\"The data files, or directory containing the data files, to generate the dataset from.\",\n )\n filetype: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The expected filetype. 
If not provided, it will be inferred from the file extension.\",\n )\n repo_id: ExcludedField[Union[str, None]] = None\n\n def load(self) -> None:\n \"\"\"Load the dataset from the file/s in disk.\"\"\"\n GeneratorStep.load(self)\n\n data_path = UPath(self.data_files, storage_options=self.storage_options)\n\n (data_files, self.filetype) = self._prepare_data_files(data_path)\n\n self._dataset = load_dataset(\n self.filetype,\n data_files=data_files,\n split=self.split,\n streaming=self.streaming,\n storage_options=self.storage_options,\n )\n\n if not self.streaming and self.num_examples:\n self._dataset = self._dataset.select(range(self.num_examples))\n if not self.num_examples:\n if self.streaming:\n # There's no better way to get the number of examples in a streaming dataset,\n # load it again for the moment.\n self.num_examples = len(\n load_dataset(\n self.filetype, data_files=self.data_files, split=self.split\n )\n )\n else:\n self.num_examples = len(self._dataset)\n\n @staticmethod\n def _prepare_data_files( # noqa: C901\n data_path: UPath,\n ) -> Tuple[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]], str]:\n \"\"\"Prepare the loading process by setting the `data_files` attribute.\n\n Args:\n data_path: The path to the data files, or directory containing the data files.\n\n Returns:\n Tuple with the data files and the filetype.\n \"\"\"\n\n def get_filetype(data_path: UPath) -> str:\n filetype = data_path.suffix.lstrip(\".\")\n if filetype == \"jsonl\":\n filetype = \"json\"\n return filetype\n\n if data_path.is_file() or (\n len(str(data_path.parent.glob(data_path.name))) >= 1\n ):\n filetype = get_filetype(data_path)\n data_files = str(data_path)\n\n elif data_path.is_dir():\n file_sequence = []\n file_map = defaultdict(list)\n for file_or_folder in data_path.iterdir():\n if file_or_folder.is_file():\n file_sequence.append(str(file_or_folder))\n elif file_or_folder.is_dir():\n for file in file_or_folder.iterdir():\n file_sequence.append(str(file))\n file_map[str(file_or_folder)].append(str(file))\n\n data_files = file_sequence or file_map\n # Try to obtain the filetype from any of the files, assuming all files have the same type.\n if file_sequence:\n filetype = get_filetype(UPath(file_sequence[0]))\n else:\n filetype = get_filetype(UPath(file_map[list(file_map.keys())[0]][0]))\n return data_files, filetype\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The columns that will be generated by this step, based on the datasets from a file\n in disk.\n\n Returns:\n The columns that will be generated by this step.\n \"\"\"\n # We assume there are Dataset/IterableDataset, not it's ...Dict counterparts\n if self._dataset is None:\n self.load()\n\n return self._dataset.column_names\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem.outputs","title":"outputs: List[str] property ","text":"The columns that will be generated by this step, based on the datasets from a file in disk. Returns: Type Description List[str] The columns that will be generated by this step. "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem.load","title":"load() ","text":"Load the dataset from the file/s in disk. 
Source code in src/distilabel/steps/generators/huggingface.py def load(self) -> None:\n \"\"\"Load the dataset from the file/s in disk.\"\"\"\n GeneratorStep.load(self)\n\n data_path = UPath(self.data_files, storage_options=self.storage_options)\n\n (data_files, self.filetype) = self._prepare_data_files(data_path)\n\n self._dataset = load_dataset(\n self.filetype,\n data_files=data_files,\n split=self.split,\n streaming=self.streaming,\n storage_options=self.storage_options,\n )\n\n if not self.streaming and self.num_examples:\n self._dataset = self._dataset.select(range(self.num_examples))\n if not self.num_examples:\n if self.streaming:\n # There's no better way to get the number of examples in a streaming dataset,\n # load it again for the moment.\n self.num_examples = len(\n load_dataset(\n self.filetype, data_files=self.data_files, split=self.split\n )\n )\n else:\n self.num_examples = len(self._dataset)\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub","title":"LoadDataFromHub ","text":" Bases: GeneratorStep Loads a dataset from the Hugging Face Hub. GeneratorStep that loads a dataset from the Hugging Face Hub using the datasets library. Attributes: Name Type Description repo_id RuntimeParameter[str] The Hugging Face Hub repository ID of the dataset to load. split RuntimeParameter[str] The split of the dataset to load. config Optional[RuntimeParameter[str]] The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations. Runtime parameters batch_size : The batch size to use when processing the data. repo_id : The Hugging Face Hub repository ID of the dataset to load. split : The split of the dataset to load. Defaults to 'train'. config : The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations. revision : The revision of the dataset to load. Defaults to the latest revision. streaming : Whether to load the dataset in streaming mode or not. Defaults to False . num_examples : The number of examples to load from the dataset. By default will load all examples. storage_options : Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . Output columns - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. Categories Examples: Load data from a dataset in Hugging Face Hub: from distilabel.steps import LoadDataFromHub\n\nloader = LoadDataFromHub(\n repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n split=\"test\",\n batch_size=2\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'prompt': 'Arianna has 12...', False)\n Source code in src/distilabel/steps/generators/huggingface.py class LoadDataFromHub(GeneratorStep):\n \"\"\"Loads a dataset from the Hugging Face Hub.\n\n `GeneratorStep` that loads a dataset from the Hugging Face Hub using the `datasets`\n library.\n\n Attributes:\n repo_id: The Hugging Face Hub repository ID of the dataset to load.\n split: The split of the dataset to load.\n config: The configuration of the dataset to load. This is optional and only needed\n if the dataset has multiple configurations.\n\n Runtime parameters:\n - `batch_size`: The batch size to use when processing the data.\n - `repo_id`: The Hugging Face Hub repository ID of the dataset to load.\n - `split`: The split of the dataset to load. Defaults to 'train'.\n - `config`: The configuration of the dataset to load. This is optional and only\n needed if the dataset has multiple configurations.\n - `revision`: The revision of the dataset to load. Defaults to the latest revision.\n - `streaming`: Whether to load the dataset in streaming mode or not. Defaults to\n `False`.\n - `num_examples`: The number of examples to load from the dataset.\n By default will load all examples.\n - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n Defaults to `None`.\n\n Output columns:\n - dynamic (`all`): The columns that will be generated by this step, based on the\n datasets loaded from the Hugging Face Hub.\n\n Categories:\n - load\n\n Examples:\n Load data from a dataset in Hugging Face Hub:\n\n ```python\n from distilabel.steps import LoadDataFromHub\n\n loader = LoadDataFromHub(\n repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n split=\"test\",\n batch_size=2\n )\n loader.load()\n\n # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n result = next(loader.process())\n # >>> result\n # ([{'prompt': 'Arianna has 12...', False)\n ```\n \"\"\"\n\n repo_id: RuntimeParameter[str] = Field(\n default=None,\n description=\"The Hugging Face Hub repository ID of the dataset to load.\",\n )\n split: RuntimeParameter[str] = Field(\n default=\"train\",\n description=\"The split of the dataset to load. Defaults to 'train'.\",\n )\n config: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The configuration of the dataset to load. This is optional and only\"\n \" needed if the dataset has multiple configurations.\",\n )\n revision: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The revision of the dataset to load. Defaults to the latest revision.\",\n )\n streaming: RuntimeParameter[bool] = Field(\n default=False,\n description=\"Whether to load the dataset in streaming mode or not. Defaults to False.\",\n )\n num_examples: Optional[RuntimeParameter[int]] = Field(\n default=None,\n description=\"The number of examples to load from the dataset. 
By default will load all examples.\",\n )\n storage_options: Optional[Dict[str, Any]] = Field(\n default=None,\n description=\"The storage options to use when loading the dataset.\",\n )\n\n _dataset: Union[IterableDataset, Dataset, None] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Load the dataset from the Hugging Face Hub\"\"\"\n super().load()\n\n if self._dataset is not None:\n # Here to simplify the functionality of\u00a0distilabel.steps.generators.util.make_generator_step\n return\n\n self._dataset = load_dataset(\n self.repo_id, # type: ignore\n self.config,\n split=self.split,\n revision=self.revision,\n streaming=self.streaming,\n )\n num_examples = self._get_dataset_num_examples()\n self.num_examples = (\n min(self.num_examples, num_examples) if self.num_examples else num_examples\n )\n\n if not self.streaming:\n self._dataset = self._dataset.select(range(self.num_examples))\n\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Yields batches from the loaded dataset from the Hugging Face Hub.\n\n Args:\n offset: The offset to start yielding the data from. Will be used during the caching\n process to help skipping already processed data.\n\n Yields:\n A tuple containing a batch of rows and a boolean indicating if the batch is\n the last one.\n \"\"\"\n num_returned_rows = 0\n for batch_num, batch in enumerate(\n self._dataset.iter(batch_size=self.batch_size) # type: ignore\n ):\n if batch_num * self.batch_size < offset:\n continue\n transformed_batch = self._transform_batch(batch)\n batch_size = len(transformed_batch)\n num_returned_rows += batch_size\n yield transformed_batch, num_returned_rows >= self.num_examples\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The columns that will be generated by this step, based on the datasets loaded\n from the Hugging Face Hub.\n\n Returns:\n The columns that will be generated by this step.\n \"\"\"\n return self._get_dataset_columns()\n\n def _transform_batch(self, batch: Dict[str, Any]) -> List[Dict[str, Any]]:\n \"\"\"Transform a batch of data from the Hugging Face Hub into a list of rows.\n\n Args:\n batch: The batch of data from the Hugging Face Hub.\n\n Returns:\n A list of rows, where each row is a dictionary of column names and values.\n \"\"\"\n length = len(next(iter(batch.values())))\n rows = []\n for i in range(length):\n rows.append({col: values[i] for col, values in batch.items()})\n return rows\n\n def _get_dataset_num_examples(self) -> int:\n \"\"\"Get the number of examples in the dataset, based on the `split` and `config`\n runtime parameters provided.\n\n Returns:\n The number of examples in the dataset.\n \"\"\"\n default_config = self.config\n if not default_config:\n default_config = list(self._dataset_info.keys())[0]\n\n return self._dataset_info[default_config].splits[self.split].num_examples\n\n def _get_dataset_columns(self) -> List[str]:\n \"\"\"Get the columns of the dataset, based on the `config` runtime parameter provided.\n\n Returns:\n The columns of the dataset.\n \"\"\"\n return list(\n self._dataset_info[\n self.config if self.config else \"default\"\n ].features.keys()\n )\n\n @cached_property\n def _dataset_info(self) -> Dict[str, DatasetInfo]:\n \"\"\"Calls the Datasets Server API from Hugging Face to obtain the dataset information.\n\n Returns:\n The dataset information.\n \"\"\"\n\n try:\n return get_dataset_infos(self.repo_id)\n except Exception as e:\n warnings.warn(\n f\"Failed to get dataset info from Hugging Face Hub, trying to get it loading the dataset. 
Error: {e}\",\n UserWarning,\n stacklevel=2,\n )\n ds = load_dataset(self.repo_id, config=self.config, split=self.split)\n if self.config:\n return ds[self.config].info\n return ds.info\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.outputs","title":"outputs: List[str] property ","text":"The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. Returns: Type Description List[str] The columns that will be generated by this step. "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.load","title":"load() ","text":"Load the dataset from the Hugging Face Hub Source code in src/distilabel/steps/generators/huggingface.py def load(self) -> None:\n \"\"\"Load the dataset from the Hugging Face Hub\"\"\"\n super().load()\n\n if self._dataset is not None:\n # Here to simplify the functionality of\u00a0distilabel.steps.generators.util.make_generator_step\n return\n\n self._dataset = load_dataset(\n self.repo_id, # type: ignore\n self.config,\n split=self.split,\n revision=self.revision,\n streaming=self.streaming,\n )\n num_examples = self._get_dataset_num_examples()\n self.num_examples = (\n min(self.num_examples, num_examples) if self.num_examples else num_examples\n )\n\n if not self.streaming:\n self._dataset = self._dataset.select(range(self.num_examples))\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.process","title":"process(offset=0) ","text":"Yields batches from the loaded dataset from the Hugging Face Hub. Parameters: Name Type Description Default offset int The offset to start yielding the data from. Will be used during the caching process to help skipping already processed data. 0 Yields: Type Description GeneratorStepOutput A tuple containing a batch of rows and a boolean indicating if the batch is GeneratorStepOutput the last one. Source code in src/distilabel/steps/generators/huggingface.py def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Yields batches from the loaded dataset from the Hugging Face Hub.\n\n Args:\n offset: The offset to start yielding the data from. Will be used during the caching\n process to help skipping already processed data.\n\n Yields:\n A tuple containing a batch of rows and a boolean indicating if the batch is\n the last one.\n \"\"\"\n num_returned_rows = 0\n for batch_num, batch in enumerate(\n self._dataset.iter(batch_size=self.batch_size) # type: ignore\n ):\n if batch_num * self.batch_size < offset:\n continue\n transformed_batch = self._transform_batch(batch)\n batch_size = len(transformed_batch)\n num_returned_rows += batch_size\n yield transformed_batch, num_returned_rows >= self.num_examples\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.PushToHub","title":"PushToHub ","text":" Bases: GlobalStep Push data to a Hugging Face Hub dataset. A GlobalStep which creates a datasets.Dataset with the input data and pushes it to the Hugging Face Hub. Attributes: Name Type Description repo_id RuntimeParameter[str] The Hugging Face Hub repository ID where the dataset will be uploaded. split RuntimeParameter[str] The split of the dataset that will be pushed. Defaults to \"train\" . private RuntimeParameter[bool] Whether the dataset to be pushed should be private or not. Defaults to False . token Optional[RuntimeParameter[str]] The token that will be used to authenticate in the Hub. If not provided, the token will be tried to be obtained from the environment variable HF_TOKEN . 
If not provided via one of the previous methods, the huggingface_hub library will try to use the token from the local Hugging Face CLI configuration. Defaults to None . Runtime parameters repo_id : The Hugging Face Hub repository ID where the dataset will be uploaded. split : The split of the dataset that will be pushed. private : Whether the dataset to be pushed should be private or not. token : The token that will be used to authenticate in the Hub. Input columns - dynamic (
all ): all columns from the input will be used to create the dataset. Categories Examples: Push batches of your dataset to the Hugging Face Hub repository: from distilabel.steps import PushToHub\n\npush = PushToHub(repo_id=\"path_to/repo\")\npush.load()\n\nresult = next(\n push.process(\n [\n {\n \"instruction\": \"instruction \",\n \"generation\": \"generation\"\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction ', 'generation': 'generation'}]\n Source code in src/distilabel/steps/globals/huggingface.py class PushToHub(GlobalStep):\n \"\"\"Push data to a Hugging Face Hub dataset.\n\n A `GlobalStep` which creates a `datasets.Dataset` with the input data and pushes\n it to the Hugging Face Hub.\n\n Attributes:\n repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.\n split: The split of the dataset that will be pushed. Defaults to `\"train\"`.\n private: Whether the dataset to be pushed should be private or not. Defaults to\n `False`.\n token: The token that will be used to authenticate in the Hub. If not provided, the\n token will be tried to be obtained from the environment variable `HF_TOKEN`.\n If not provided using one of the previous methods, then `huggingface_hub` library\n will try to use the token from the local Hugging Face CLI configuration. Defaults\n to `None`.\n\n Runtime parameters:\n - `repo_id`: The Hugging Face Hub repository ID where the dataset will be uploaded.\n - `split`: The split of the dataset that will be pushed.\n - `private`: Whether the dataset to be pushed should be private or not.\n - `token`: The token that will be used to authenticate in the Hub.\n\n Input columns:\n - dynamic (`all`): all columns from the input will be used to create the dataset.\n\n Categories:\n - save\n - dataset\n - huggingface\n\n Examples:\n Push batches of your dataset to the Hugging Face Hub repository:\n\n ```python\n from distilabel.steps import PushToHub\n\n push = PushToHub(repo_id=\"path_to/repo\")\n push.load()\n\n result = next(\n push.process(\n [\n {\n \"instruction\": \"instruction \",\n \"generation\": \"generation\"\n }\n ],\n )\n )\n # >>> result\n # [{'instruction': 'instruction ', 'generation': 'generation'}]\n ```\n \"\"\"\n\n repo_id: RuntimeParameter[str] = Field(\n default=None,\n description=\"The Hugging Face Hub repository ID where the dataset will be uploaded.\",\n )\n split: RuntimeParameter[str] = Field(\n default=\"train\",\n description=\"The split of the dataset that will be pushed. Defaults to 'train'.\",\n )\n private: RuntimeParameter[bool] = Field(\n default=False,\n description=\"Whether the dataset to be pushed should be private or not. Defaults\"\n \" to `False`.\",\n )\n token: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The token that will be used to authenticate in the Hub. If not provided,\"\n \" the token will be tried to be obtained from the environment variable `HF_TOKEN`.\"\n \" If not provided using one of the previous methods, then `huggingface_hub` library\"\n \" will try to use the token from the local Hugging Face CLI configuration. 
Defaults\"\n \" to `None`\",\n )\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Method that processes the input data, respecting the `datasets.Dataset` formatting,\n and pushes it to the Hugging Face Hub based on the `RuntimeParameter`s attributes.\n\n Args:\n inputs: that input data within a single object (as it's a GlobalStep) that\n will be transformed into a `datasets.Dataset`.\n\n Yields:\n Propagates the received inputs so that the `Distiset` can be generated if this is\n the last step of the `Pipeline`, or if this is not a leaf step and has follow up\n steps.\n \"\"\"\n dataset_dict = defaultdict(list)\n for input in inputs:\n for key, value in input.items():\n dataset_dict[key].append(value)\n dataset_dict = dict(dataset_dict)\n dataset = Dataset.from_dict(dataset_dict)\n dataset.push_to_hub(\n self.repo_id, # type: ignore\n split=self.split,\n private=self.private,\n token=self.token or os.getenv(\"HF_TOKEN\"),\n )\n yield inputs\n "},{"location":"api/step_gallery/hugging_face/#distilabel.steps.PushToHub.process","title":"process(inputs) ","text":"Method that processes the input data, respecting the datasets.Dataset formatting, and pushes it to the Hugging Face Hub based on the RuntimeParameter s attributes. Parameters: Name Type Description Default inputs StepInput that input data within a single object (as it's a GlobalStep) that will be transformed into a datasets.Dataset . required Yields: Type Description StepOutput Propagates the received inputs so that the Distiset can be generated if this is StepOutput the last step of the Pipeline , or if this is not a leaf step and has follow up StepOutput steps. Source code in src/distilabel/steps/globals/huggingface.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Method that processes the input data, respecting the `datasets.Dataset` formatting,\n and pushes it to the Hugging Face Hub based on the `RuntimeParameter`s attributes.\n\n Args:\n inputs: that input data within a single object (as it's a GlobalStep) that\n will be transformed into a `datasets.Dataset`.\n\n Yields:\n Propagates the received inputs so that the `Distiset` can be generated if this is\n the last step of the `Pipeline`, or if this is not a leaf step and has follow up\n steps.\n \"\"\"\n dataset_dict = defaultdict(list)\n for input in inputs:\n for key, value in input.items():\n dataset_dict[key].append(value)\n dataset_dict = dict(dataset_dict)\n dataset = Dataset.from_dict(dataset_dict)\n dataset.push_to_hub(\n self.repo_id, # type: ignore\n split=self.split,\n private=self.private,\n token=self.token or os.getenv(\"HF_TOKEN\"),\n )\n yield inputs\n "},{"location":"api/task/","title":"Task","text":"This section contains the API reference for the distilabel tasks. For more information on how the Task works and see some examples, check the Tutorial - Task page. "},{"location":"api/task/#distilabel.steps.tasks.base","title":"base ","text":""},{"location":"api/task/#distilabel.steps.tasks.base._Task","title":"_Task ","text":" Bases: _Step , ABC _Task is an abstract class that implements the _Step interface and adds the format_input and format_output methods to format the inputs and outputs of the task. It also adds a llm attribute to be used as the LLM to generate the outputs. Attributes: Name Type Description llm LLM the LLM to be used to generate the outputs of the task. group_generations bool whether to group the num_generations generated per input in a list or create a row per generation. 
Defaults to False . add_raw_output RuntimeParameter[bool] whether to include a field with the raw output of the LLM in the distilabel_metadata field of the output. Can be helpful to not loose data with Tasks that need to format the output of the LLM . Defaults to False . num_generations RuntimeParameter[int] The number of generations to be produced per input. Source code in src/distilabel/steps/tasks/base.py class _Task(_Step, ABC):\n \"\"\"_Task is an abstract class that implements the `_Step` interface and adds the\n `format_input` and `format_output` methods to format the inputs and outputs of the\n task. It also adds a `llm` attribute to be used as the LLM to generate the outputs.\n\n Attributes:\n llm: the `LLM` to be used to generate the outputs of the task.\n group_generations: whether to group the `num_generations` generated per input in\n a list or create a row per generation. Defaults to `False`.\n add_raw_output: whether to include a field with the raw output of the LLM in the\n `distilabel_metadata` field of the output. Can be helpful to not loose data\n with `Tasks` that need to format the output of the `LLM`. Defaults to `False`.\n num_generations: The number of generations to be produced per input.\n \"\"\"\n\n llm: LLM\n\n group_generations: bool = False\n add_raw_output: RuntimeParameter[bool] = Field(\n default=True,\n description=(\n \"Whether to include the raw output of the LLM in the key `raw_output_<TASK_NAME>`\"\n \" of the `distilabel_metadata` dictionary output column\"\n ),\n )\n add_raw_input: RuntimeParameter[bool] = Field(\n default=True,\n description=(\n \"Whether to include the raw input of the LLM in the key `raw_input_<TASK_NAME>`\"\n \" of the `distilabel_metadata` dictionary column\"\n ),\n )\n num_generations: RuntimeParameter[int] = Field(\n default=1, description=\"The number of generations to be produced per input.\"\n )\n use_default_structured_output: bool = False\n\n _can_be_used_with_offline_batch_generation: bool = PrivateAttr(False)\n\n def model_post_init(self, __context: Any) -> None:\n if (\n self.llm.use_offline_batch_generation\n and not self._can_be_used_with_offline_batch_generation\n ):\n raise DistilabelUserError(\n f\"`{self.__class__.__name__}` task cannot be used with offline batch generation\"\n \" feature.\",\n page=\"sections/how_to_guides/advanced/offline-batch-generation\",\n )\n\n super().model_post_init(__context)\n\n @property\n def is_global(self) -> bool:\n \"\"\"Extends the `is_global` property to return `True` if the task is using the\n offline batch generation feature, otherwise it returns the value of the parent\n class property. 
`offline_batch_generation` requires to receive all the inputs\n at once, so for the `_BatchManager` this is a global step.\n\n Returns:\n Whether the task is a global step or not.\n \"\"\"\n if self.llm.use_offline_batch_generation:\n return True\n\n return super().is_global\n\n def load(self) -> None:\n \"\"\"Loads the LLM via the `LLM.load()` method.\"\"\"\n super().load()\n self._set_default_structured_output()\n self.llm.load()\n\n @override\n def unload(self) -> None:\n \"\"\"Unloads the LLM.\"\"\"\n self._logger.debug(\"Executing task unload logic.\")\n self.llm.unload()\n\n @override\n def impute_step_outputs(\n self, step_output: List[Dict[str, Any]]\n ) -> List[Dict[str, Any]]:\n \"\"\"\n Imputes the outputs of the task in case the LLM failed to generate a response.\n \"\"\"\n result = []\n for row in step_output:\n data = row.copy()\n for output in self.get_outputs().keys():\n data[output] = None\n data = self._create_metadata(\n data,\n None,\n None,\n add_raw_output=self.add_raw_output,\n add_raw_input=self.add_raw_input,\n )\n result.append(data)\n return result\n\n @abstractmethod\n def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n ) -> Dict[str, Any]:\n \"\"\"Abstract method to format the outputs of the task. It needs to receive an output\n as a string, and generates a Python dictionary with the outputs of the task. In\n addition the `input` used to generate the output is also received just in case it's\n needed to be able to parse the output correctly.\n \"\"\"\n pass\n\n def _format_outputs(\n self,\n outputs: \"GenerateOutput\",\n input: Union[Dict[str, Any], None] = None,\n ) -> List[Dict[str, Any]]:\n \"\"\"Formats the outputs of the task using the `format_output` method. If the output\n is `None` (i.e. the LLM failed to generate a response), then the outputs will be\n set to `None` as well.\n\n Args:\n outputs: The outputs (`n` generations) for the provided `input`.\n input: The input used to generate the output.\n\n Returns:\n A list containing a dictionary with the outputs of the task for each input.\n \"\"\"\n inputs = [None] if input is None else [input]\n formatted_outputs = []\n repeate_inputs = len(outputs.get(\"generations\"))\n outputs = normalize_statistics(outputs)\n\n for (output, stats, extra), input in zip(\n iterate_generations_with_stats(outputs), inputs * repeate_inputs\n ): # type: ignore\n try:\n # Extract the generations, and move the statistics to the distilabel_metadata,\n # to keep everything clean\n formatted_output = self.format_output(output, input)\n formatted_output = self._create_metadata(\n output=formatted_output,\n raw_output=output,\n input=input,\n add_raw_output=self.add_raw_output, # type: ignore\n add_raw_input=self.add_raw_input, # type: ignore\n statistics=stats,\n )\n formatted_output = self._create_extra(\n output=formatted_output, extra=extra\n )\n formatted_outputs.append(formatted_output)\n except Exception as e:\n self._logger.warning( # type: ignore\n f\"Task '{self.name}' failed to format output: {e}. 
Saving raw response.\" # type: ignore\n )\n formatted_outputs.append(self._output_on_failure(output, input))\n return formatted_outputs\n\n def _output_on_failure(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n \"\"\"In case of failure to format the output, this method will return a dictionary including\n a new field `distilabel_meta` with the raw output of the LLM.\n \"\"\"\n # Create a dictionary with the outputs of the task (every output set to None)\n outputs = {output: None for output in self.outputs}\n outputs[\"model_name\"] = self.llm.model_name # type: ignore\n outputs = self._create_metadata(\n outputs,\n output,\n input,\n add_raw_output=self.add_raw_output, # type: ignore\n add_raw_input=self.add_raw_input, # type: ignore\n )\n return outputs\n\n def _create_metadata(\n self,\n output: Dict[str, Any],\n raw_output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n add_raw_output: bool = True,\n add_raw_input: bool = True,\n statistics: Optional[\"LLMStatistics\"] = None,\n ) -> Dict[str, Any]:\n \"\"\"Adds the raw output and or the formatted input of the LLM to the output dictionary\n if `add_raw_output` is True or `add_raw_input` is True.\n\n Args:\n output:\n The output dictionary after formatting the output from the LLM,\n to add the raw output and or raw input.\n raw_output: The raw output of the `LLM`.\n input: The input used to generate the output.\n add_raw_output: Whether to add the raw output to the output dictionary.\n add_raw_input: Whether to add the raw input to the output dictionary.\n statistics: The statistics generated by the LLM, which should contain at least\n the number of input and output tokens.\n \"\"\"\n meta = output.get(DISTILABEL_METADATA_KEY, {})\n\n if add_raw_output:\n meta[f\"raw_output_{self.name}\"] = raw_output\n\n if add_raw_input:\n meta[f\"raw_input_{self.name}\"] = self.format_input(input) if input else None\n\n if statistics:\n meta[f\"statistics_{self.name}\"] = statistics\n\n if meta:\n output[DISTILABEL_METADATA_KEY] = meta\n\n return output\n\n def _create_extra(\n self, output: Dict[str, Any], extra: Dict[str, Any]\n ) -> Dict[str, Any]:\n column_name_prefix = f\"llm_{self.name}_\"\n for key, value in extra.items():\n column_name = column_name_prefix + key\n output[column_name] = value\n return output\n\n def _set_default_structured_output(self) -> None:\n \"\"\"Prepares the structured output to be set in the selected `LLM`.\n\n If the method `get_structured_output` returns None (the default), there's no need\n to set anything, as it doesn't apply.\n If the `use_default_structured_output` and there's no previous structured output\n set by hand, then decide the type of structured output to select depending on the\n `LLM` provider.\n \"\"\"\n schema = self.get_structured_output()\n if not schema:\n return\n\n if self.use_default_structured_output and not self.llm.structured_output:\n # In case the default structured output is required, we have to set it before\n # the LLM is loaded\n from distilabel.models.llms import InferenceEndpointsLLM\n from distilabel.models.llms.base import AsyncLLM\n\n def check_dependency(module_name: str) -> None:\n if not importlib.util.find_spec(module_name):\n raise ImportError(\n f\"`{module_name}` is not installed and is needed for the structured generation with this LLM.\"\n f\" Please install it using `pip install {module_name}`.\"\n )\n\n dependency = \"outlines\"\n structured_output = {\"schema\": schema}\n if isinstance(self.llm, 
InferenceEndpointsLLM):\n structured_output.update({\"format\": \"json\"})\n # To determine instructor or outlines format\n elif isinstance(self.llm, AsyncLLM) and not isinstance(\n self.llm, InferenceEndpointsLLM\n ):\n dependency = \"instructor\"\n structured_output.update({\"format\": \"json\"})\n\n check_dependency(dependency)\n self.llm.structured_output = structured_output\n\n def get_structured_output(self) -> Union[Dict[str, Any], None]:\n \"\"\"Returns the structured output for a task that implements one by default,\n must be overriden by subclasses of `Task`. When implemented, should be a json\n schema that enforces the response from the LLM so that it's easier to parse.\n \"\"\"\n return None\n\n def _sample_input(self) -> \"ChatType\":\n \"\"\"Returns a sample input to be used in the `print` method.\n Tasks that don't adhere to a format input that returns a map of the type\n str -> str should override this method to return a sample input.\n \"\"\"\n return self.format_input(\n {input: f\"<PLACEHOLDER_{input.upper()}>\" for input in self.inputs}\n )\n\n def print(self, sample_input: Optional[\"ChatType\"] = None) -> None:\n \"\"\"Prints a sample input to the console using the `rich` library.\n Helper method to visualize the prompt of the task.\n\n Args:\n sample_input: A sample input to be printed. If not provided, a default will be\n generated using the `_sample_input` method, which can be overriden by\n subclasses. This should correspond to the same example you could pass to\n the `format_input` method.\n The variables be named <PLACEHOLDER_VARIABLE_NAME> by default.\n\n Examples:\n Print the URIAL prompt:\n\n ```python\n from distilabel.steps.tasks import URIAL\n from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n urial = URIAL(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n )\n urial.load()\n urial.print()\n \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n \u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n \u2502 \u2502 # Instruction \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you). \u2502 \u2502\n \u2502 \u2502 Users place their queries under \"# User:\", and your responses are under \"# Assistant:\". \u2502 \u2502\n \u2502 \u2502 You are a helpful, respectful, and honest assistant. \u2502 \u2502\n \u2502 \u2502 You should always answer as helpfully as possible while ensuring safety. 
\u2502 \u2502\n \u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n \u2502 \u2502 have an engaging tone. \u2502 \u2502\n \u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic, \u2502 \u2502\n \u2502 \u2502 dangerous, or illegal content, even if it may be helpful. \u2502 \u2502\n \u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some \u2502 \u2502\n \u2502 \u2502 controversial topics. \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 # User: \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 <PLACEHOLDER_INSTRUCTION> \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 # Assistant: \u2502 \u2502\n \u2502 \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n ```\n \"\"\"\n from rich.console import Console, Group\n from rich.panel import Panel\n from rich.text import Text\n\n console = Console()\n sample_input = sample_input or self._sample_input()\n\n panels = []\n for item in sample_input:\n content = Text.assemble((item.get(\"content\", \"\"),))\n panel = Panel(\n content,\n title=f\"[bold][magenta]{item.get('role', '').capitalize()} Message[/magenta][/bold]\",\n border_style=\"light_cyan3\",\n )\n panels.append(panel)\n\n # Create a group of panels\n # Wrap the group in an outer panel\n outer_panel = Panel(\n Group(*panels),\n title=f\"[bold][magenta]Prompt: {type(self).__name__} [/magenta][/bold]\",\n border_style=\"light_cyan3\",\n expand=False,\n )\n console.print(outer_panel)\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.is_global","title":"is_global: bool property ","text":"Extends the is_global property to return True if the task is using the offline batch generation feature, otherwise it returns the value of the parent class property. offline_batch_generation requires to receive all the inputs at once, so for the _BatchManager this is a global step. Returns: Type Description bool Whether the task is a global step or not. "},{"location":"api/task/#distilabel.steps.tasks.base._Task.load","title":"load() ","text":"Loads the LLM via the LLM.load() method. 
Source code in src/distilabel/steps/tasks/base.py def load(self) -> None:\n \"\"\"Loads the LLM via the `LLM.load()` method.\"\"\"\n super().load()\n self._set_default_structured_output()\n self.llm.load()\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.unload","title":"unload() ","text":"Unloads the LLM. Source code in src/distilabel/steps/tasks/base.py @override\ndef unload(self) -> None:\n \"\"\"Unloads the LLM.\"\"\"\n self._logger.debug(\"Executing task unload logic.\")\n self.llm.unload()\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.impute_step_outputs","title":"impute_step_outputs(step_output) ","text":"Imputes the outputs of the task in case the LLM failed to generate a response. Source code in src/distilabel/steps/tasks/base.py @override\ndef impute_step_outputs(\n self, step_output: List[Dict[str, Any]]\n) -> List[Dict[str, Any]]:\n \"\"\"\n Imputes the outputs of the task in case the LLM failed to generate a response.\n \"\"\"\n result = []\n for row in step_output:\n data = row.copy()\n for output in self.get_outputs().keys():\n data[output] = None\n data = self._create_metadata(\n data,\n None,\n None,\n add_raw_output=self.add_raw_output,\n add_raw_input=self.add_raw_input,\n )\n result.append(data)\n return result\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.format_output","title":"format_output(output, input=None) abstractmethod ","text":"Abstract method to format the outputs of the task. It needs to receive an output as a string, and generates a Python dictionary with the outputs of the task. In addition the input used to generate the output is also received just in case it's needed to be able to parse the output correctly. Source code in src/distilabel/steps/tasks/base.py @abstractmethod\ndef format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n \"\"\"Abstract method to format the outputs of the task. It needs to receive an output\n as a string, and generates a Python dictionary with the outputs of the task. In\n addition the `input` used to generate the output is also received just in case it's\n needed to be able to parse the output correctly.\n \"\"\"\n pass\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.get_structured_output","title":"get_structured_output() ","text":"Returns the structured output for a task that implements one by default, must be overriden by subclasses of Task . When implemented, should be a json schema that enforces the response from the LLM so that it's easier to parse. Source code in src/distilabel/steps/tasks/base.py def get_structured_output(self) -> Union[Dict[str, Any], None]:\n \"\"\"Returns the structured output for a task that implements one by default,\n must be overriden by subclasses of `Task`. When implemented, should be a json\n schema that enforces the response from the LLM so that it's easier to parse.\n \"\"\"\n return None\n "},{"location":"api/task/#distilabel.steps.tasks.base._Task.print","title":"print(sample_input=None) ","text":"Prints a sample input to the console using the rich library. Helper method to visualize the prompt of the task. Parameters: Name Type Description Default sample_input Optional[ChatType] A sample input to be printed. If not provided, a default will be generated using the _sample_input method, which can be overriden by subclasses. This should correspond to the same example you could pass to the format_input method. The variables be named by default. 
None Examples: Print the URIAL prompt: from distilabel.steps.tasks import URIAL\nfrom distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nurial = URIAL(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n)\nurial.load()\nurial.print()\n\u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n\u2502 \u2502 # Instruction \u2502 \u2502\n\u2502 \u2502 \u2502 \u2502\n\u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you). \u2502 \u2502\n\u2502 \u2502 Users place their queries under \"# User:\", and your responses are under \"# Assistant:\". \u2502 \u2502\n\u2502 \u2502 You are a helpful, respectful, and honest assistant. \u2502 \u2502\n\u2502 \u2502 You should always answer as helpfully as possible while ensuring safety. \u2502 \u2502\n\u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n\u2502 \u2502 have an engaging tone. \u2502 \u2502\n\u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic, \u2502 \u2502\n\u2502 \u2502 dangerous, or illegal content, even if it may be helpful. \u2502 \u2502\n\u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some \u2502 \u2502\n\u2502 \u2502 controversial topics. 
\u2502 \u2502\n\u2502 \u2502 \u2502 \u2502\n\u2502 \u2502 \u2502 \u2502\n\u2502 \u2502 # User: \u2502 \u2502\n\u2502 \u2502 \u2502 \u2502\n\u2502 \u2502 <PLACEHOLDER_INSTRUCTION> \u2502 \u2502\n\u2502 \u2502 \u2502 \u2502\n\u2502 \u2502 # Assistant: \u2502 \u2502\n\u2502 \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n Source code in src/distilabel/steps/tasks/base.py def print(self, sample_input: Optional[\"ChatType\"] = None) -> None:\n \"\"\"Prints a sample input to the console using the `rich` library.\n Helper method to visualize the prompt of the task.\n\n Args:\n sample_input: A sample input to be printed. If not provided, a default will be\n generated using the `_sample_input` method, which can be overriden by\n subclasses. This should correspond to the same example you could pass to\n the `format_input` method.\n The variables be named <PLACEHOLDER_VARIABLE_NAME> by default.\n\n Examples:\n Print the URIAL prompt:\n\n ```python\n from distilabel.steps.tasks import URIAL\n from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n urial = URIAL(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n )\n urial.load()\n urial.print()\n \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n \u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n \u2502 \u2502 # Instruction \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you). 
\u2502 \u2502\n \u2502 \u2502 Users place their queries under \"# User:\", and your responses are under \"# Assistant:\". \u2502 \u2502\n \u2502 \u2502 You are a helpful, respectful, and honest assistant. \u2502 \u2502\n \u2502 \u2502 You should always answer as helpfully as possible while ensuring safety. \u2502 \u2502\n \u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n \u2502 \u2502 have an engaging tone. \u2502 \u2502\n \u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic, \u2502 \u2502\n \u2502 \u2502 dangerous, or illegal content, even if it may be helpful. \u2502 \u2502\n \u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some \u2502 \u2502\n \u2502 \u2502 controversial topics. \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 # User: \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 <PLACEHOLDER_INSTRUCTION> \u2502 \u2502\n \u2502 \u2502 \u2502 \u2502\n \u2502 \u2502 # Assistant: \u2502 \u2502\n \u2502 \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n ```\n \"\"\"\n from rich.console import Console, Group\n from rich.panel import Panel\n from rich.text import Text\n\n console = Console()\n sample_input = sample_input or self._sample_input()\n\n panels = []\n for item in sample_input:\n content = Text.assemble((item.get(\"content\", \"\"),))\n panel = Panel(\n content,\n title=f\"[bold][magenta]{item.get('role', '').capitalize()} Message[/magenta][/bold]\",\n border_style=\"light_cyan3\",\n )\n panels.append(panel)\n\n # Create a group of panels\n # Wrap the group in an outer panel\n outer_panel = Panel(\n Group(*panels),\n title=f\"[bold][magenta]Prompt: {type(self).__name__} [/magenta][/bold]\",\n border_style=\"light_cyan3\",\n expand=False,\n )\n console.print(outer_panel)\n "},{"location":"api/task/#distilabel.steps.tasks.base.Task","title":"Task ","text":" Bases: _Task , Step Task is a class that implements the _Task abstract class and adds the Step interface to be used as a step in the pipeline. Attributes: Name Type Description llm the LLM to be used to generate the outputs of the task. group_generations whether to group the num_generations generated per input in a list or create a row per generation. Defaults to False . num_generations The number of generations to be produced per input. 
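As a quick orientation before the source listing, the following is a minimal, illustrative sketch of a custom subclass; the MyQATask name and its column names are invented for this example and are not part of distilabel. A subclass declares its input/output columns and implements format_input (building an OpenAI chat-like list of dicts) and format_output (mapping the raw generation back to columns): from typing import Any, Dict, List, Union\n\nfrom distilabel.steps.tasks import Task\n\n\nclass MyQATask(Task):\n    \"\"\"Toy task that asks the LLM to answer an instruction.\"\"\"\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"instruction\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"answer\", \"model_name\"]\n\n    def format_input(self, input: Dict[str, Any]) -> List[Dict[str, str]]:\n        # Build an OpenAI chat-like list of dicts from the input row.\n        return [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n\n    def format_output(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        # Map the raw generation back to the declared output columns.\n        return {\"answer\": output}\n The llm , group_generations and num_generations attributes described above are inherited from Task and configured when the task is instantiated.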
Source code in src/distilabel/steps/tasks/base.py class Task(_Task, Step):\n \"\"\"Task is a class that implements the `_Task` abstract class and adds the `Step`\n interface to be used as a step in the pipeline.\n\n Attributes:\n llm: the `LLM` to be used to generate the outputs of the task.\n group_generations: whether to group the `num_generations` generated per input in\n a list or create a row per generation. Defaults to `False`.\n num_generations: The number of generations to be produced per input.\n \"\"\"\n\n @abstractmethod\n def format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n \"\"\"Abstract method to format the inputs of the task. It needs to receive an input\n as a Python dictionary, and generates an OpenAI chat-like list of dicts.\"\"\"\n pass\n\n def _format_inputs(self, inputs: List[Dict[str, Any]]) -> List[\"FormattedInput\"]:\n \"\"\"Formats the inputs of the task using the `format_input` method.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list containing the formatted inputs, which are `ChatType`-like following\n the OpenAI formatting.\n \"\"\"\n return [self.format_input(input) for input in inputs]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n formatted_inputs = self._format_inputs(inputs)\n\n # `outputs` is a dict containing the LLM outputs in the `generations`\n # key and the statistics in the `statistics` key\n outputs = self.llm.generate_outputs(\n inputs=formatted_inputs,\n num_generations=self.num_generations, # type: ignore\n **self.llm.get_generation_kwargs(), # type: ignore\n )\n task_outputs = []\n for input, input_outputs in zip(inputs, outputs):\n formatted_outputs = self._format_outputs(input_outputs, input)\n\n if self.group_generations:\n combined = group_dicts(*formatted_outputs)\n task_outputs.append(\n {**input, **combined, \"model_name\": self.llm.model_name}\n )\n continue\n\n # Create a row per generation\n for formatted_output in formatted_outputs:\n task_outputs.append(\n {**input, **formatted_output, \"model_name\": self.llm.model_name}\n )\n\n yield task_outputs\n "},{"location":"api/task/#distilabel.steps.tasks.base.Task.format_input","title":"format_input(input) abstractmethod ","text":"Abstract method to format the inputs of the task. It needs to receive an input as a Python dictionary, and generates an OpenAI chat-like list of dicts. Source code in src/distilabel/steps/tasks/base.py @abstractmethod\ndef format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n \"\"\"Abstract method to format the inputs of the task. It needs to receive an input\n as a Python dictionary, and generates an OpenAI chat-like list of dicts.\"\"\"\n pass\n "},{"location":"api/task/#distilabel.steps.tasks.base.Task.process","title":"process(inputs) ","text":"Processes the inputs of the task and generates the outputs using the LLM. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Yields: Type Description StepOutput A list of Python dictionaries with the outputs of the task. 
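An illustrative direct call, mirroring the examples elsewhere in this reference (it assumes task is an already instantiated and configured Task subclass, such as the sketch above, wrapping some LLM): task.load()\nbatch = [{\"instruction\": \"What is the capital of France?\"}]\nresults = next(task.process(batch))\n# Each output row keeps the input columns and adds the task outputs plus `model_name`;\n# with `group_generations=False` one row is produced per generation.\n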
Source code in src/distilabel/steps/tasks/base.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n formatted_inputs = self._format_inputs(inputs)\n\n # `outputs` is a dict containing the LLM outputs in the `generations`\n # key and the statistics in the `statistics` key\n outputs = self.llm.generate_outputs(\n inputs=formatted_inputs,\n num_generations=self.num_generations, # type: ignore\n **self.llm.get_generation_kwargs(), # type: ignore\n )\n task_outputs = []\n for input, input_outputs in zip(inputs, outputs):\n formatted_outputs = self._format_outputs(input_outputs, input)\n\n if self.group_generations:\n combined = group_dicts(*formatted_outputs)\n task_outputs.append(\n {**input, **combined, \"model_name\": self.llm.model_name}\n )\n continue\n\n # Create a row per generation\n for formatted_output in formatted_outputs:\n task_outputs.append(\n {**input, **formatted_output, \"model_name\": self.llm.model_name}\n )\n\n yield task_outputs\n "},{"location":"api/task/generator_task/","title":"GeneratorTask","text":"This section contains the API reference for the distilabel generator tasks. For more information on how the GeneratorTask works and see some examples, check the Tutorial - Task - GeneratorTask page. "},{"location":"api/task/generator_task/#distilabel.steps.tasks.base.GeneratorTask","title":"GeneratorTask ","text":" Bases: _Task , GeneratorStep GeneratorTask is a class that implements the _Task abstract class and adds the GeneratorStep interface to be used as a step in the pipeline. Attributes: Name Type Description llm the LLM to be used to generate the outputs of the task. group_generations whether to group the num_generations generated per input in a list or create a row per generation. Defaults to False . num_generations The number of generations to be produced per input. Source code in src/distilabel/steps/tasks/base.py class GeneratorTask(_Task, GeneratorStep):\n \"\"\"`GeneratorTask` is a class that implements the `_Task` abstract class and adds the\n `GeneratorStep` interface to be used as a step in the pipeline.\n\n Attributes:\n llm: the `LLM` to be used to generate the outputs of the task.\n group_generations: whether to group the `num_generations` generated per input in\n a list or create a row per generation. Defaults to `False`.\n num_generations: The number of generations to be produced per input.\n \"\"\"\n\n pass\n "},{"location":"api/task/task_gallery/","title":"Task Gallery","text":"This section contains the existing Task subclasses implemented in distilabel . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks","title":"tasks ","text":""},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker","title":"APIGenExecutionChecker ","text":" Bases: Step Executes the generated function calls. This step checks if a given answer from a model as generated by APIGenGenerator can be executed against the given library (given by libpath , which is a string pointing to a python .py file with functions). Attributes: Name Type Description libpath str The path to the library where we will retrieve the functions. It can also point to a folder with the functions. 
In this case, the folder layout should be a folder with .py files, each containing a single function, the name of the function being the same as the filename. check_is_dangerous bool Bool to exclude some potentially dangerous functions; it contains some heuristics found while testing. These functions can run subprocesses, deal with the OS, or have other potentially dangerous operations. Defaults to True. Input columns - answers (
str ): List with arguments to be passed to the function, dumped as a string from a list of dictionaries. Should be loaded using json.loads . Output columns - keep_row_after_execution_check (
bool ): Whether the function should be kept or not. - execution_result (
str ): The result from executing the function. Categories - filtering - execution References - APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets
- Salesforce/xlam-function-calling-60k
Examples: Execute a function from a given library with the answer from an LLM: from distilabel.steps.tasks import APIGenExecutionChecker\n\n# For the libpath you can use as an example the file at the tests folder:\n# ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\ntask = APIGenExecutionChecker(\n libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n)\ntask.load()\n\nres = next(\n task.process(\n [\n {\n \"answers\": [\n {\n \"arguments\": {\n \"initial_velocity\": 0.2,\n \"acceleration\": 0.1,\n \"time\": 0.5,\n },\n \"name\": \"final_velocity\",\n }\n ],\n }\n ]\n )\n)\nres\n#[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n Source code in src/distilabel/steps/tasks/apigen/execution_checker.py class APIGenExecutionChecker(Step):\n \"\"\"Executes the generated function calls.\n\n This step checks if a given answer from a model as generated by `APIGenGenerator`\n can be executed against the given library (given by `libpath`, which is a string\n pointing to a python .py file with functions).\n\n Attributes:\n libpath: The path to the library where we will retrieve the functions.\n It can also point to a folder with the functions. In this case, the folder\n layout should be a folder with .py files, each containing a single function,\n the name of the function being the same as the filename.\n check_is_dangerous: Bool to exclude some potentially dangerous functions, it contains\n some heuristics found while testing. This functions can run subprocesses, deal with\n the OS, or have other potentially dangerous operations. Defaults to True.\n\n Input columns:\n - answers (`str`): List with arguments to be passed to the function,\n dumped as a string from a list of dictionaries. 
Should be loaded using\n `json.loads`.\n\n Output columns:\n - keep_row_after_execution_check (`bool`): Whether the function should be kept or not.\n - execution_result (`str`): The result from executing the function.\n\n Categories:\n - filtering\n - execution\n\n References:\n - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n Examples:\n Execute a function from a given library with the answer from an LLM:\n\n ```python\n from distilabel.steps.tasks import APIGenExecutionChecker\n\n # For the libpath you can use as an example the file at the tests folder:\n # ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\n task = APIGenExecutionChecker(\n libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n )\n task.load()\n\n res = next(\n task.process(\n [\n {\n \"answers\": [\n {\n \"arguments\": {\n \"initial_velocity\": 0.2,\n \"acceleration\": 0.1,\n \"time\": 0.5,\n },\n \"name\": \"final_velocity\",\n }\n ],\n }\n ]\n )\n )\n res\n #[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n ```\n \"\"\"\n\n libpath: str = Field(\n default=...,\n description=(\n \"The path to the library where we will retrieve the functions, \"\n \"or a folder with python files named the same as the functions they contain.\",\n ),\n )\n check_is_dangerous: bool = Field(\n default=True,\n description=(\n \"Bool to exclude some potentially dangerous functions, it contains \"\n \"some heuristics found while testing. This functions can run subprocesses, \"\n \"deal with the OS, or have other potentially dangerous operations.\",\n ),\n )\n\n _toolbox: Union[\"ModuleType\", None] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the library where the functions will be extracted from.\"\"\"\n super().load()\n if Path(self.libpath).suffix == \".py\":\n self._toolbox = load_module_from_path(self.libpath)\n\n def unload(self) -> None:\n self._toolbox = None\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task are those found in the original dataset.\"\"\"\n return [\"answers\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs are the columns required by `APIGenGenerator` task.\"\"\"\n return [\"keep_row_after_execution_check\", \"execution_result\"]\n\n def _get_function(self, function_name: str) -> Callable:\n \"\"\"Retrieves the function from the toolbox.\n\n Args:\n function_name: The name of the function to retrieve.\n\n Returns:\n Callable: The function to be executed.\n \"\"\"\n if self._toolbox:\n return getattr(self._toolbox, function_name, None)\n try:\n toolbox = load_module_from_path(\n str(Path(self.libpath) / f\"{function_name}.py\")\n )\n return getattr(toolbox, function_name, None)\n except FileNotFoundError:\n return None\n except Exception as e:\n self._logger.warning(f\"Error loading function '{function_name}': {e}\")\n return None\n\n def _is_dangerous(self, function: Callable) -> bool:\n \"\"\"Checks if a function is dangerous to remove it.\n Contains a list of heuristics to avoid executing possibly dangerous functions.\n \"\"\"\n source_code = inspect.getsource(function)\n # We don't want to execute functions that use subprocess\n if (\n (\"subprocess.\" in source_code)\n or 
(\"os.system(\" in source_code)\n or (\"input(\" in source_code)\n # Avoiding threading\n or (\"threading.Thread(\" in source_code)\n or (\"exec(\" in source_code)\n # Avoiding argparse (not sure why)\n or (\"argparse.ArgumentParser(\" in source_code)\n # Avoiding logging changing the levels to not mess with the logs\n or (\".setLevel(\" in source_code)\n # Don't run a test battery\n or (\"unittest.main(\" in source_code)\n # Avoid exiting the program\n or (\"sys.exit(\" in source_code)\n or (\"exit(\" in source_code)\n or (\"raise SystemExit(\" in source_code)\n or (\"multiprocessing.Pool(\" in source_code)\n ):\n return True\n return False\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Checks the answer to see if it can be executed.\n Captures the possible errors and returns them.\n\n If a single example is provided, it is copied to avoid raising an error.\n\n Args:\n inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n output = []\n if input[\"answers\"]:\n answers = json.loads(input[\"answers\"])\n else:\n input.update(\n **{\n \"keep_row_after_execution_check\": False,\n \"execution_result\": [\"No answers were provided.\"],\n }\n )\n continue\n for answer in answers:\n if answer is None:\n output.append(\n {\n \"keep\": False,\n \"execution_result\": \"Nothing was generated for this answer.\",\n }\n )\n continue\n\n function_name = answer.get(\"name\", None)\n arguments = answer.get(\"arguments\", None)\n\n self._logger.debug(\n f\"Executing function '{function_name}' with arguments: {arguments}\"\n )\n function = self._get_function(function_name)\n\n if self.check_is_dangerous:\n if function and self._is_dangerous(function):\n function = None\n\n if function is None:\n output.append(\n {\n \"keep\": False,\n \"execution_result\": f\"Function '{function_name}' not found.\",\n }\n )\n else:\n execution = execute_from_response(function, arguments)\n output.append(\n {\n \"keep\": execution[\"keep\"],\n \"execution_result\": execution[\"execution_result\"],\n }\n )\n # We only consider a good response if all the answers were executed successfully,\n # but keep the reasons for further review if needed.\n input.update(\n **{\n \"keep_row_after_execution_check\": all(\n o[\"keep\"] is True for o in output\n ),\n \"execution_result\": [o[\"execution_result\"] for o in output],\n }\n )\n\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task are those found in the original dataset. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.outputs","title":"outputs: StepColumns property ","text":"The outputs are the columns required by APIGenGenerator task. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.load","title":"load() ","text":"Loads the library where the functions will be extracted from. Source code in src/distilabel/steps/tasks/apigen/execution_checker.py def load(self) -> None:\n \"\"\"Loads the library where the functions will be extracted from.\"\"\"\n super().load()\n if Path(self.libpath).suffix == \".py\":\n self._toolbox = load_module_from_path(self.libpath)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker._get_function","title":"_get_function(function_name) ","text":"Retrieves the function from the toolbox. 
Parameters: Name Type Description Default function_name str The name of the function to retrieve. required Returns: Name Type Description Callable Callable The function to be executed. Source code in src/distilabel/steps/tasks/apigen/execution_checker.py def _get_function(self, function_name: str) -> Callable:\n \"\"\"Retrieves the function from the toolbox.\n\n Args:\n function_name: The name of the function to retrieve.\n\n Returns:\n Callable: The function to be executed.\n \"\"\"\n if self._toolbox:\n return getattr(self._toolbox, function_name, None)\n try:\n toolbox = load_module_from_path(\n str(Path(self.libpath) / f\"{function_name}.py\")\n )\n return getattr(toolbox, function_name, None)\n except FileNotFoundError:\n return None\n except Exception as e:\n self._logger.warning(f\"Error loading function '{function_name}': {e}\")\n return None\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker._is_dangerous","title":"_is_dangerous(function) ","text":"Checks if a function is dangerous to remove it. Contains a list of heuristics to avoid executing possibly dangerous functions. Source code in src/distilabel/steps/tasks/apigen/execution_checker.py def _is_dangerous(self, function: Callable) -> bool:\n \"\"\"Checks if a function is dangerous to remove it.\n Contains a list of heuristics to avoid executing possibly dangerous functions.\n \"\"\"\n source_code = inspect.getsource(function)\n # We don't want to execute functions that use subprocess\n if (\n (\"subprocess.\" in source_code)\n or (\"os.system(\" in source_code)\n or (\"input(\" in source_code)\n # Avoiding threading\n or (\"threading.Thread(\" in source_code)\n or (\"exec(\" in source_code)\n # Avoiding argparse (not sure why)\n or (\"argparse.ArgumentParser(\" in source_code)\n # Avoiding logging changing the levels to not mess with the logs\n or (\".setLevel(\" in source_code)\n # Don't run a test battery\n or (\"unittest.main(\" in source_code)\n # Avoid exiting the program\n or (\"sys.exit(\" in source_code)\n or (\"exit(\" in source_code)\n or (\"raise SystemExit(\" in source_code)\n or (\"multiprocessing.Pool(\" in source_code)\n ):\n return True\n return False\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.process","title":"process(inputs) ","text":"Checks the answer to see if it can be executed. Captures the possible errors and returns them. If a single example is provided, it is copied to avoid raising an error. Parameters: Name Type Description Default inputs StepInput A list of dictionaries with the input data. required Yields: Type Description StepOutput A list of dictionaries with the output data. 
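A hypothetical failure case illustrating the shape of the output when a generated call cannot be resolved (it reuses the task instance from the class example above; the function name is deliberately one that does not exist in the library): res = next(\n    task.process(\n        [{\"answers\": '[{\"arguments\": {}, \"name\": \"nonexistent_function\"}]'}]\n    )\n)\n# res[0][\"keep_row_after_execution_check\"] -> False\n# res[0][\"execution_result\"] -> [\"Function 'nonexistent_function' not found.\"]\n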
Source code in src/distilabel/steps/tasks/apigen/execution_checker.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Checks the answer to see if it can be executed.\n Captures the possible errors and returns them.\n\n If a single example is provided, it is copied to avoid raising an error.\n\n Args:\n inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n output = []\n if input[\"answers\"]:\n answers = json.loads(input[\"answers\"])\n else:\n input.update(\n **{\n \"keep_row_after_execution_check\": False,\n \"execution_result\": [\"No answers were provided.\"],\n }\n )\n continue\n for answer in answers:\n if answer is None:\n output.append(\n {\n \"keep\": False,\n \"execution_result\": \"Nothing was generated for this answer.\",\n }\n )\n continue\n\n function_name = answer.get(\"name\", None)\n arguments = answer.get(\"arguments\", None)\n\n self._logger.debug(\n f\"Executing function '{function_name}' with arguments: {arguments}\"\n )\n function = self._get_function(function_name)\n\n if self.check_is_dangerous:\n if function and self._is_dangerous(function):\n function = None\n\n if function is None:\n output.append(\n {\n \"keep\": False,\n \"execution_result\": f\"Function '{function_name}' not found.\",\n }\n )\n else:\n execution = execute_from_response(function, arguments)\n output.append(\n {\n \"keep\": execution[\"keep\"],\n \"execution_result\": execution[\"execution_result\"],\n }\n )\n # We only consider a good response if all the answers were executed successfully,\n # but keep the reasons for further review if needed.\n input.update(\n **{\n \"keep_row_after_execution_check\": all(\n o[\"keep\"] is True for o in output\n ),\n \"execution_result\": [o[\"execution_result\"] for o in output],\n }\n )\n\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator","title":"APIGenGenerator ","text":" Bases: Task Generate queries and answers for the given functions in JSON format. The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\nverifiable and diverse function-calling datasets. The task generates a set of diverse queries\nand corresponding answers for the given functions in JSON format.\n\nAttributes:\n system_prompt: The system prompt to guide the user in the generation of queries and answers.\n use_tools: Whether to use the tools available in the prompt to generate the queries and answers.\n In case the tools are given in the input, they will be added to the prompt.\n number: The number of queries to generate. 
It can be a list, where each number will be\n chosen randomly, or a dictionary with the number of queries and the probability of each.\n I.e: `number=1`, `number=[1, 2, 3]`, `number={1: 0.5, 2: 0.3, 3: 0.2}` are all valid inputs.\n It corresponds to the number of parallel queries to generate.\n use_default_structured_output: Whether to use the default structured output or not.\n\nInput columns:\n - examples (`str`): Examples used as few shots to guide the model.\n - func_name (`str`): Name for the function to generate.\n - func_desc (`str`): Description of what the function should do.\n - tools (`str`): JSON formatted string containing the tool representation of the function.\n\nOutput columns:\n - query (`str`): The list of queries.\n - answers (`str`): JSON formatted string with the list of answers, containing the info as\n a dictionary to be passed to the functions.\n\nCategories:\n - text-generation\n\nReferences:\n - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\nExamples:\n Generate without structured output (original implementation):\n\n ```python\n from distilabel.steps.tasks import ApiGenGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n apigen = ApiGenGenerator(\n use_default_structured_output=False,\n llm=llm\n )\n apigen.load()\n\n res = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\n What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', \"func_name\": \"getrandommovie\", \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\" } ] ) ) res # [{'examples': 'QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', # 'number': 1, # 'func_name': 'getrandommovie', # 'func_desc': 'Returns a list of random movies from a database by calling an external API.', # 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?', # 'Give me 5 random movie suggestions from your database to plan my weekend.'], # 'answers': [[{'name': 'getrandommovie', 'arguments': {}}], # [{'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}]], # 'raw_input_api_gen_generator_0': [{'role': 'system', # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format. Construct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date. Ensure the query: - Is clear and concise - Demonstrates typical use cases - Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words - Across a variety level of difficulties, ranging from beginner and advanced use cases - The corresponding result's parameter types and ranges match with the function's descriptions Ensure the answer: - Is a list of function calls in JSON format - The length of the answer list should be equal to the number of requests in the query - Can solve all the requests in the query effectively\"}, # {'role': 'user', # 'content': 'Here are examples of queries and the corresponding answers for similar functions: QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}] Note that the query could be interpreted as a combination of several independent requests. Based on these examples, generate 2 diverse query and answer pairs for the function getrandommovie The detailed function description is the following: Returns a list of random movies from a database by calling an external API. The output MUST strictly adhere to the following JSON format, and NO other text MUST be included: [\n {\n \"query\": \"The generated query.\",\n \"answers\": [\n {\n \"name\": \"api_name\",\n \"arguments\": {\n \"arg_name\": \"value\"\n ... (more arguments as required)\n }\n },\n ... (more API calls as required)\n ]\n }\n]\n Now please generate 2 diverse query and answer pairs following the above format.'}]}, # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] ``` Generate with structured output:\n\n ```python\n from distilabel.steps.tasks import ApiGenGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n apigen = ApiGenGenerator(\n use_default_structured_output=True,\n llm=llm\n )\n apigen.load()\n\n res_struct = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\n What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', \"func_name\": \"getrandommovie\", \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\" } ] ) ) res_struct # [{'examples': 'QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', # 'number': 1, # 'func_name': 'getrandommovie', # 'func_desc': 'Returns a list of random movies from a database by calling an external API.', # 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\", # \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"], # 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}], # [{'arguments': {}, 'name': 'getrandommovie'}]], # 'raw_input_api_gen_generator_0': [{'role': 'system', # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format. Construct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date. 
Ensure the query: - Is clear and concise - Demonstrates typical use cases - Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words - Across a variety level of difficulties, ranging from beginner and advanced use cases - The corresponding result's parameter types and ranges match with the function's descriptions Ensure the answer: - Is a list of function calls in JSON format - The length of the answer list should be equal to the number of requests in the query - Can solve all the requests in the query effectively\"}, # {'role': 'user', # 'content': 'Here are examples of queries and the corresponding answers for similar functions: QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}] Note that the query could be interpreted as a combination of several independent requests. Based on these examples, generate 2 diverse query and answer pairs for the function getrandommovie The detailed function description is the following: Returns a list of random movies from a database by calling an external API. Now please generate 2 diverse query and answer pairs following the above format.'}]}, # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] ``` Source code in src/distilabel/steps/tasks/apigen/generator.py class APIGenGenerator(Task):\n \"\"\"Generate queries and answers for the given functions in JSON format.\n\n The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\n verifiable and diverse function-calling datasets. The task generates a set of diverse queries\n and corresponding answers for the given functions in JSON format.\n\n Attributes:\n system_prompt: The system prompt to guide the user in the generation of queries and answers.\n use_tools: Whether to use the tools available in the prompt to generate the queries and answers.\n In case the tools are given in the input, they will be added to the prompt.\n number: The number of queries to generate. 
It can be a list, where each number will be\n chosen randomly, or a dictionary with the number of queries and the probability of each.\n I.e: `number=1`, `number=[1, 2, 3]`, `number={1: 0.5, 2: 0.3, 3: 0.2}` are all valid inputs.\n It corresponds to the number of parallel queries to generate.\n use_default_structured_output: Whether to use the default structured output or not.\n\n Input columns:\n - examples (`str`): Examples used as few shots to guide the model.\n - func_name (`str`): Name for the function to generate.\n - func_desc (`str`): Description of what the function should do.\n - tools (`str`): JSON formatted string containing the tool representation of the function.\n\n Output columns:\n - query (`str`): The list of queries.\n - answers (`str`): JSON formatted string with the list of answers, containing the info as\n a dictionary to be passed to the functions.\n\n Categories:\n - text-generation\n\n References:\n - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n Examples:\n Generate without structured output (original implementation):\n\n ```python\n from distilabel.steps.tasks import ApiGenGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n apigen = ApiGenGenerator(\n use_default_structured_output=False,\n llm=llm\n )\n apigen.load()\n\n res = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n \"func_name\": \"getrandommovie\",\n \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n }\n ]\n )\n )\n res\n # [{'examples': 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n # 'number': 1,\n # 'func_name': 'getrandommovie',\n # 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n # 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?',\n # 'Give me 5 random movie suggestions from your database to plan my weekend.'],\n # 'answers': [[{'name': 'getrandommovie', 'arguments': {}}],\n # [{'name': 'getrandommovie', 'arguments': {}},\n # {'name': 'getrandommovie', 'arguments': {}},\n # {'name': 'getrandommovie', 'arguments': {}},\n # {'name': 'getrandommovie', 'arguments': {}},\n # {'name': 'getrandommovie', 'arguments': {}}]],\n # 'raw_input_api_gen_generator_0': [{'role': 'system',\n # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\"},\n # {'role': 'user',\n # 'content': 'Here are examples of queries and the corresponding answers for similar functions:\\nQUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\\nThe detailed function description is the following:\\nReturns a list of random movies from a database by calling an external API.\\n\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n```json\\n[\\n {\\n \"query\": \"The generated query.\",\\n \"answers\": [\\n {\\n \"name\": \"api_name\",\\n \"arguments\": {\\n \"arg_name\": \"value\"\\n ... (more arguments as required)\\n }\\n },\\n ... (more API calls as required)\\n ]\\n }\\n]\\n```\\n\\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Generate with structured output:\n\n ```python\n from distilabel.steps.tasks import ApiGenGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n apigen = ApiGenGenerator(\n use_default_structured_output=True,\n llm=llm\n )\n apigen.load()\n\n res_struct = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n \"func_name\": \"getrandommovie\",\n \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n }\n ]\n )\n )\n res_struct\n # [{'examples': 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n # 'number': 1,\n # 'func_name': 'getrandommovie',\n # 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n # 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\",\n # \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"],\n # 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}],\n # [{'arguments': {}, 'name': 'getrandommovie'}]],\n # 'raw_input_api_gen_generator_0': [{'role': 'system',\n # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. 
For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\"},\n # {'role': 'user',\n # 'content': 'Here are examples of queries and the corresponding answers for similar functions:\\nQUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\\nThe detailed function description is the following:\\nReturns a list of random movies from a database by calling an external API.\\n\\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n \"\"\"\n\n system_prompt: str = SYSTEM_PROMPT_API_GEN\n use_default_structured_output: bool = False\n number: Union[int, List[int], Dict[int, float]] = 1\n use_tools: bool = True\n\n _number: Union[int, None] = PrivateAttr(None)\n _fn_parallel_queries: Union[Callable[[], str], None] = PrivateAttr(None)\n _format_inst: Union[str, None] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the template for the generator prompt.\"\"\"\n super().load()\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"apigen\"\n / \"generator.jinja2\"\n )\n self._template = Template(open(_path).read())\n self._format_inst = self._set_format_inst()\n\n def _parallel_queries(self, number: int) -> Callable[[int], str]:\n \"\"\"Prepares the function to update the parallel queries guide in the prompt.\n\n Raises:\n ValueError: if `is_parallel` is not a boolean or a list of floats.\n\n Returns:\n The function to generate the parallel queries guide.\n \"\"\"\n if number > 1:\n return (\n \"It can contain multiple parallel queries in natural language for the given functions. 
\"\n \"They could use either the same function with different arguments or different functions.\\n\"\n )\n return \"\"\n\n def _get_number(self) -> int:\n \"\"\"Generates the number of queries to generate in a single call.\n The number must be set to `_number` to avoid changing the original value\n when calling `_default_error`.\n \"\"\"\n if isinstance(self.number, list):\n self._number = random.choice(self.number)\n elif isinstance(self.number, dict):\n self._number = random.choices(\n list(self.number.keys()), list(self.number.values())\n )[0]\n else:\n self._number = self.number\n return self._number\n\n def _set_format_inst(self) -> str:\n \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n If the default structured output is used, returns an empty string because nothing\n else is needed, otherwise, returns the original addition to the prompt to guide the model\n to generate a formatted JSON.\n \"\"\"\n return (\n \"\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n\"\n \"```\\n\"\n \"[\\n\"\n \" {\\n\"\n ' \"query\": \"The generated query.\",\\n'\n ' \"answers\": [\\n'\n \" {\\n\"\n ' \"name\": \"api_name\",\\n'\n ' \"arguments\": {\\n'\n ' \"arg_name\": \"value\"\\n'\n \" ... (more arguments as required)\\n\"\n \" }\\n\"\n \" },\\n\"\n \" ... (more API calls as required)\\n\"\n \" ]\\n\"\n \" }\\n\"\n \"]\\n\"\n \"```\\n\"\n )\n\n def _get_func_desc(self, input: Dict[str, Any]) -> str:\n \"\"\"If available and required, will use the info from the tools in the\n prompt for extra information. Otherwise will use jut the function description.\n \"\"\"\n if not self.use_tools:\n return input[\"func_desc\"]\n extra = \"\" # Extra information from the tools (if available will be added)\n if \"tools\" in input:\n extra = f\"\\n\\nThis is the available tool to guide you (respect the order of the parameters):\\n{input['tools']}\"\n return input[\"func_desc\"] + extra\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task.\"\"\"\n return {\n \"examples\": True,\n \"func_name\": True,\n \"func_desc\": True,\n \"tools\": False,\n }\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType`.\"\"\"\n number = self._get_number()\n parallel_queries = self._parallel_queries(number)\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n examples=input[\"examples\"],\n parallel_queries=parallel_queries,\n number=number,\n func_name=input[\"func_name\"],\n func_desc=self._get_func_desc(input),\n format_inst=self._format_inst,\n ),\n },\n ]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The output for the task are the queries and corresponding answers.\"\"\"\n return [\"query\", \"answers\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the queries and answers pairs.\n The answers are an array of answers corresponding to the query.\n Each answer is represented as an object with the following properties:\n - name (string): The name of the tool used to generate the answer.\n - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n Each argument is represented as a key-value pair, where the key is the parameter name and the\n value is the corresponding value.\n \"\"\"\n if output is None:\n return self._default_error(input)\n\n if not self.use_default_structured_output:\n output = remove_fences(output)\n\n try:\n pairs = orjson.loads(output)\n except orjson.JSONDecodeError:\n return self._default_error(input)\n\n pairs = pairs[\"pairs\"] if self.use_default_structured_output else pairs\n\n return self._format_output(pairs, input)\n\n def _format_output(\n self, pairs: Dict[str, Any], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"Parses the response, returning a dictionary with queries and answers.\n\n Args:\n pairs: The parsed dictionary from the LLM's output.\n input: The input from the `LLM`.\n\n Returns:\n Formatted output, where the `queries` are a list of strings, and the `answers`\n are a list of objects.\n \"\"\"\n try:\n input.update(\n **{\n \"query\": pairs[0][\"query\"],\n \"answers\": json.dumps(pairs[0][\"answers\"]),\n }\n )\n return input\n except Exception as e:\n self._logger.error(f\"Error formatting output: {e}, pairs: '{pairs}'\")\n return self._default_error(input)\n\n def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"Returns a default error output, to fill the responses in case of failure.\"\"\"\n input.update(\n **{\n \"query\": None,\n \"answers\": json.dumps([None] * self._number),\n }\n )\n return input\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from typing import Dict, List\n from pydantic import BaseModel\n\n\n class Answer(BaseModel):\n name: str\n arguments: Dict[str, str]\n\n class QueryAnswer(BaseModel):\n query: str\n answers: List[Answer]\n\n class QueryAnswerPairs(BaseModel):\n pairs: List[QueryAnswer]\n\n json.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Answer\": {\n \"properties\": {\n \"name\": {\"title\": \"Name\", \"type\": \"string\"},\n \"arguments\": {\n \"additionalProperties\": {\"type\": \"string\"},\n \"title\": \"Arguments\",\n \"type\": \"object\",\n },\n },\n \"required\": [\"name\", \"arguments\"],\n \"title\": \"Answer\",\n \"type\": \"object\",\n },\n \"QueryAnswer\": {\n \"properties\": {\n \"query\": {\"title\": \"Query\", \"type\": \"string\"},\n \"answers\": {\n \"items\": {\"$ref\": \"#/$defs/Answer\"},\n \"title\": \"Answers\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"query\", \"answers\"],\n \"title\": \"QueryAnswer\",\n \"type\": \"object\",\n },\n },\n \"properties\": {\n \"pairs\": {\n \"items\": {\"$ref\": \"#/$defs/QueryAnswer\"},\n \"title\": \"Pairs\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"pairs\"],\n \"title\": \"QueryAnswerPairs\",\n \"type\": \"object\",\n }\n 
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.outputs","title":"outputs: StepColumns property ","text":"The output for the task are the queries and corresponding answers. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.load","title":"load() ","text":"Loads the template for the generator prompt. Source code in src/distilabel/steps/tasks/apigen/generator.py def load(self) -> None:\n \"\"\"Loads the template for the generator prompt.\"\"\"\n super().load()\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"apigen\"\n / \"generator.jinja2\"\n )\n self._template = Template(open(_path).read())\n self._format_inst = self._set_format_inst()\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._parallel_queries","title":"_parallel_queries(number) ","text":"Prepares the function to update the parallel queries guide in the prompt. Raises: Type Description ValueError if is_parallel is not a boolean or a list of floats. Returns: Type Description Callable[[int], str] The function to generate the parallel queries guide. Source code in src/distilabel/steps/tasks/apigen/generator.py def _parallel_queries(self, number: int) -> Callable[[int], str]:\n \"\"\"Prepares the function to update the parallel queries guide in the prompt.\n\n Raises:\n ValueError: if `is_parallel` is not a boolean or a list of floats.\n\n Returns:\n The function to generate the parallel queries guide.\n \"\"\"\n if number > 1:\n return (\n \"It can contain multiple parallel queries in natural language for the given functions. \"\n \"They could use either the same function with different arguments or different functions.\\n\"\n )\n return \"\"\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._get_number","title":"_get_number() ","text":"Generates the number of queries to generate in a single call. The number must be set to _number to avoid changing the original value when calling _default_error . Source code in src/distilabel/steps/tasks/apigen/generator.py def _get_number(self) -> int:\n \"\"\"Generates the number of queries to generate in a single call.\n The number must be set to `_number` to avoid changing the original value\n when calling `_default_error`.\n \"\"\"\n if isinstance(self.number, list):\n self._number = random.choice(self.number)\n elif isinstance(self.number, dict):\n self._number = random.choices(\n list(self.number.keys()), list(self.number.values())\n )[0]\n else:\n self._number = self.number\n return self._number\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._set_format_inst","title":"_set_format_inst() ","text":"Prepares the function to generate the formatted instructions for the prompt. If the default structured output is used, returns an empty string because nothing else is needed, otherwise, returns the original addition to the prompt to guide the model to generate a formatted JSON. 
Source code in src/distilabel/steps/tasks/apigen/generator.py def _set_format_inst(self) -> str:\n \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n If the default structured output is used, returns an empty string because nothing\n else is needed, otherwise, returns the original addition to the prompt to guide the model\n to generate a formatted JSON.\n \"\"\"\n return (\n \"\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n\"\n \"```\\n\"\n \"[\\n\"\n \" {\\n\"\n ' \"query\": \"The generated query.\",\\n'\n ' \"answers\": [\\n'\n \" {\\n\"\n ' \"name\": \"api_name\",\\n'\n ' \"arguments\": {\\n'\n ' \"arg_name\": \"value\"\\n'\n \" ... (more arguments as required)\\n\"\n \" }\\n\"\n \" },\\n\"\n \" ... (more API calls as required)\\n\"\n \" ]\\n\"\n \" }\\n\"\n \"]\\n\"\n \"```\\n\"\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._get_func_desc","title":"_get_func_desc(input) ","text":"If available and required, will use the info from the tools in the prompt for extra information. Otherwise will use just the function description. Source code in src/distilabel/steps/tasks/apigen/generator.py def _get_func_desc(self, input: Dict[str, Any]) -> str:\n \"\"\"If available and required, will use the info from the tools in the\n prompt for extra information. Otherwise will use just the function description.\n \"\"\"\n if not self.use_tools:\n return input[\"func_desc\"]\n extra = \"\" # Extra information from the tools (if available will be added)\n if \"tools\" in input:\n extra = f\"\\n\\nThis is the available tool to guide you (respect the order of the parameters):\\n{input['tools']}\"\n return input[\"func_desc\"] + extra\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType . Source code in src/distilabel/steps/tasks/apigen/generator.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType`.\"\"\"\n number = self._get_number()\n parallel_queries = self._parallel_queries(number)\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n examples=input[\"examples\"],\n parallel_queries=parallel_queries,\n number=number,\n func_name=input[\"func_name\"],\n func_desc=self._get_func_desc(input),\n format_inst=self._format_inst,\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.format_output","title":"format_output(output, input) ","text":"The output is formatted as a list with the score of each instruction. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the queries and answers pairs. Dict[str, Any] The answers are an array of answers corresponding to the query. Dict[str, Any] Each answer is represented as an object with the following properties: - name (string): The name of the tool used to generate the answer. - arguments (object): An object representing the arguments passed to the tool to generate the answer. Dict[str, Any] Each argument is represented as a key-value pair, where the key is the parameter name and the Dict[str, Any] value is the corresponding value. 
Source code in src/distilabel/steps/tasks/apigen/generator.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the queries and answers pairs.\n The answers are an array of answers corresponding to the query.\n Each answer is represented as an object with the following properties:\n - name (string): The name of the tool used to generate the answer.\n - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n Each argument is represented as a key-value pair, where the key is the parameter name and the\n value is the corresponding value.\n \"\"\"\n if output is None:\n return self._default_error(input)\n\n if not self.use_default_structured_output:\n output = remove_fences(output)\n\n try:\n pairs = orjson.loads(output)\n except orjson.JSONDecodeError:\n return self._default_error(input)\n\n pairs = pairs[\"pairs\"] if self.use_default_structured_output else pairs\n\n return self._format_output(pairs, input)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._format_output","title":"_format_output(pairs, input) ","text":"Parses the response, returning a dictionary with queries and answers. Parameters: Name Type Description Default pairs Dict[str, Any] The parsed dictionary from the LLM's output. required input Dict[str, Any] The input from the LLM . required Returns: Type Description Dict[str, Any] Formatted output, where the queries are a list of strings, and the answers Dict[str, Any] are a list of objects. Source code in src/distilabel/steps/tasks/apigen/generator.py def _format_output(\n self, pairs: Dict[str, Any], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"Parses the response, returning a dictionary with queries and answers.\n\n Args:\n pairs: The parsed dictionary from the LLM's output.\n input: The input from the `LLM`.\n\n Returns:\n Formatted output, where the `queries` are a list of strings, and the `answers`\n are a list of objects.\n \"\"\"\n try:\n input.update(\n **{\n \"query\": pairs[0][\"query\"],\n \"answers\": json.dumps(pairs[0][\"answers\"]),\n }\n )\n return input\n except Exception as e:\n self._logger.error(f\"Error formatting output: {e}, pairs: '{pairs}'\")\n return self._default_error(input)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._default_error","title":"_default_error(input) ","text":"Returns a default error output, to fill the responses in case of failure. Source code in src/distilabel/steps/tasks/apigen/generator.py def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"Returns a default error output, to fill the responses in case of failure.\"\"\"\n input.update(\n **{\n \"query\": None,\n \"answers\": json.dumps([None] * self._number),\n }\n )\n return input\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. 
The schema corresponds to the following: from typing import Dict, List\nfrom pydantic import BaseModel\n\n\nclass Answer(BaseModel):\n name: str\n arguments: Dict[str, str]\n\nclass QueryAnswer(BaseModel):\n query: str\n answers: List[Answer]\n\nclass QueryAnswerPairs(BaseModel):\n pairs: List[QueryAnswer]\n\njson.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/apigen/generator.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from typing import Dict, List\n from pydantic import BaseModel\n\n\n class Answer(BaseModel):\n name: str\n arguments: Dict[str, str]\n\n class QueryAnswer(BaseModel):\n query: str\n answers: List[Answer]\n\n class QueryAnswerPairs(BaseModel):\n pairs: List[QueryAnswer]\n\n json.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Answer\": {\n \"properties\": {\n \"name\": {\"title\": \"Name\", \"type\": \"string\"},\n \"arguments\": {\n \"additionalProperties\": {\"type\": \"string\"},\n \"title\": \"Arguments\",\n \"type\": \"object\",\n },\n },\n \"required\": [\"name\", \"arguments\"],\n \"title\": \"Answer\",\n \"type\": \"object\",\n },\n \"QueryAnswer\": {\n \"properties\": {\n \"query\": {\"title\": \"Query\", \"type\": \"string\"},\n \"answers\": {\n \"items\": {\"$ref\": \"#/$defs/Answer\"},\n \"title\": \"Answers\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"query\", \"answers\"],\n \"title\": \"QueryAnswer\",\n \"type\": \"object\",\n },\n },\n \"properties\": {\n \"pairs\": {\n \"items\": {\"$ref\": \"#/$defs/QueryAnswer\"},\n \"title\": \"Pairs\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"pairs\"],\n \"title\": \"QueryAnswerPairs\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker","title":"APIGenSemanticChecker ","text":" Bases: Task Generate queries and answers for the given functions in JSON format. The APIGenGenerator is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task generates a set of diverse queries and corresponding answers for the given functions in JSON format. Attributes: Name Type Description system_prompt str System prompt for the task. Has a default one. exclude_failed_execution str Whether to exclude failed executions (won't run on those rows that have a False in keep_row_after_execution_check column, which comes from running APIGenExecutionChecker ). Defaults to True. Input columns - func_desc (
str ): Description of what the function should do. - query (
str ): Instruction from the user. - answers (
str ): JSON encoded list with arguments to be passed to the function/API. Should be loaded using json.loads . - execution_result (
str ): Result of the function/API executed. Output columns - thought (
str ): Reasoning for the output on whether to keep this output or not. - keep_row_after_semantic_check (
bool ): True or False, can be used to filter afterwards. Categories - filtering
- text-generation
References - APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets
- Salesforce/xlam-function-calling-60k
Examples: Semantic checker for generated function calls (original implementation):\n\n```python\nfrom distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\nsemantic_checker = APIGenSemanticChecker(\n use_default_structured_output=False,\n llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'thought': '',\n# 'keep_row_after_semantic_check': True,\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n# 'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n# {'role': 'user',\n# 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. 
Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n \"thought\": \"Concisely describe your reasoning here\",\\n \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n```\n\nSemantic checker for generated function calls (structured output):\n\n```python\nfrom distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\nsemantic_checker = APIGenSemanticChecker(\n use_default_structured_output=True,\n llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'keep_row_after_semantic_check': True,\n# 'thought': '',\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n# 'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n# {'role': 'user',\n# 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. 
Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n```\n Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py class APIGenSemanticChecker(Task):\n r\"\"\"Generate queries and answers for the given functions in JSON format.\n\n The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\n verifiable and diverse function-calling datasets. The task generates a set of diverse queries\n and corresponding answers for the given functions in JSON format.\n\n Attributes:\n system_prompt: System prompt for the task. Has a default one.\n exclude_failed_execution: Whether to exclude failed executions (won't run on those\n rows that have a False in `keep_row_after_execution_check` column, which\n comes from running `APIGenExecutionChecker`). Defaults to True.\n\n Input columns:\n - func_desc (`str`): Description of what the function should do.\n - query (`str`): Instruction from the user.\n - answers (`str`): JSON encoded list with arguments to be passed to the function/API.\n Should be loaded using `json.loads`.\n - execution_result (`str`): Result of the function/API executed.\n\n Output columns:\n - thought (`str`): Reasoning for the output on whether to keep this output or not.\n - keep_row_after_semantic_check (`bool`): True or False, can be used to filter\n afterwards.\n\n Categories:\n - filtering\n - text-generation\n\n References:\n - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n Examples:\n\n Semantic checker for generated function calls (original implementation):\n\n ```python\n from distilabel.steps.tasks import APIGenSemanticChecker\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n semantic_checker = APIGenSemanticChecker(\n use_default_structured_output=False,\n llm=llm\n )\n semantic_checker.load()\n\n res = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n )\n res\n # [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n # 'query': 'What information can be obtained about the Maine Coon cat breed?',\n # 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n # 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n # 'thought': '',\n # 'keep_row_after_semantic_check': True,\n # 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 
'system',\n # 'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n # {'role': 'user',\n # 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n \"thought\": \"Concisely describe your reasoning here\",\\n \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Semantic checker for generated function calls (structured output):\n\n ```python\n from distilabel.steps.tasks import APIGenSemanticChecker\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n )\n semantic_checker = APIGenSemanticChecker(\n use_default_structured_output=True,\n llm=llm\n )\n semantic_checker.load()\n\n res = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n )\n res\n # [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n # 'query': 'What information can be obtained about the Maine Coon cat breed?',\n # 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n # 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n # 'keep_row_after_semantic_check': True,\n # 'thought': '',\n # 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n # 'content': 'As a data quality evaluator, you must assess the alignment 
between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n # {'role': 'user',\n # 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n \"\"\"\n\n system_prompt: str = SYSTEM_PROMPT_SEMANTIC_CHECKER\n use_default_structured_output: bool = False\n\n _format_inst: Union[str, None] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the template for the generator prompt.\"\"\"\n super().load()\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"apigen\"\n / \"semantic_checker.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n self._format_inst = self._set_format_inst()\n\n def _set_format_inst(self) -> str:\n \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n If the default structured output is used, returns an empty string because nothing\n else is needed, otherwise, returns the original addition to the prompt to guide the model\n to generate a formatted JSON.\n \"\"\"\n return (\n \"\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n\"\n \"```\\n\"\n \"{\\n\"\n ' \"thought\": \"Concisely describe your reasoning here\",\\n'\n ' \"passes\": \"yes\" or \"no\"\\n'\n \"}\\n\"\n \"```\\n\"\n )\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task.\"\"\"\n return {\n \"func_desc\": True,\n \"query\": True,\n \"answers\": True,\n \"execution_result\": True,\n \"keep_row_after_execution_check\": True,\n }\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType`.\"\"\"\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n func_desc=input[\"func_desc\"],\n query=input[\"query\"] or \"\",\n func_call=input[\"answers\"] or \"\",\n 
execution_result=input[\"execution_result\"],\n format_inst=self._format_inst,\n ),\n },\n ]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The output for the task are the queries and corresponding answers.\"\"\"\n return [\"keep_row_after_semantic_check\", \"thought\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the queries and answers pairs.\n The answers are an array of answers corresponding to the query.\n Each answer is represented as an object with the following properties:\n - name (string): The name of the tool used to generate the answer.\n - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n Each argument is represented as a key-value pair, where the key is the parameter name and the\n value is the corresponding value.\n \"\"\"\n if output is None:\n return self._default_error(input)\n\n output = remove_fences(output)\n\n try:\n result = orjson.loads(output)\n # Update the column name and change to bool\n result[\"keep_row_after_semantic_check\"] = (\n result.pop(\"passes\").lower() == \"yes\"\n )\n input.update(**result)\n return input\n except orjson.JSONDecodeError:\n return self._default_error(input)\n\n def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"Default error message for the task.\"\"\"\n input.update({\"thought\": None, \"keep_row_after_semantic_check\": None})\n return input\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from typing import Literal\n from pydantic import BaseModel\n import json\n\n class Checker(BaseModel):\n thought: str\n passes: Literal[\"yes\", \"no\"]\n\n json.dumps(Checker.model_json_schema(), indent=4)\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"thought\": {\"title\": \"Thought\", \"type\": \"string\"},\n \"passes\": {\"enum\": [\"yes\", \"no\"], \"title\": \"Passes\", \"type\": \"string\"},\n },\n \"required\": [\"thought\", \"passes\"],\n \"title\": \"Checker\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.outputs","title":"outputs: StepColumns property ","text":"The output for the task are the queries and corresponding answers. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.load","title":"load() ","text":"Loads the template for the generator prompt. 
Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py def load(self) -> None:\n \"\"\"Loads the template for the generator prompt.\"\"\"\n super().load()\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"apigen\"\n / \"semantic_checker.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n self._format_inst = self._set_format_inst()\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker._set_format_inst","title":"_set_format_inst() ","text":"Prepares the function to generate the formatted instructions for the prompt. If the default structured output is used, returns an empty string because nothing else is needed, otherwise, returns the original addition to the prompt to guide the model to generate a formatted JSON. Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py def _set_format_inst(self) -> str:\n \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n If the default structured output is used, returns an empty string because nothing\n else is needed, otherwise, returns the original addition to the prompt to guide the model\n to generate a formatted JSON.\n \"\"\"\n return (\n \"\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n\"\n \"```\\n\"\n \"{\\n\"\n ' \"thought\": \"Concisely describe your reasoning here\",\\n'\n ' \"passes\": \"yes\" or \"no\"\\n'\n \"}\\n\"\n \"```\\n\"\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType . Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType`.\"\"\"\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n func_desc=input[\"func_desc\"],\n query=input[\"query\"] or \"\",\n func_call=input[\"answers\"] or \"\",\n execution_result=input[\"execution_result\"],\n format_inst=self._format_inst,\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.format_output","title":"format_output(output, input) ","text":"The output is formatted as a list with the score of each instruction. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the queries and answers pairs. Dict[str, Any] The answers are an array of answers corresponding to the query. Dict[str, Any] Each answer is represented as an object with the following properties: - name (string): The name of the tool used to generate the answer. - arguments (object): An object representing the arguments passed to the tool to generate the answer. Dict[str, Any] Each argument is represented as a key-value pair, where the key is the parameter name and the Dict[str, Any] value is the corresponding value. Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the queries and answers pairs.\n The answers are an array of answers corresponding to the query.\n Each answer is represented as an object with the following properties:\n - name (string): The name of the tool used to generate the answer.\n - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n Each argument is represented as a key-value pair, where the key is the parameter name and the\n value is the corresponding value.\n \"\"\"\n if output is None:\n return self._default_error(input)\n\n output = remove_fences(output)\n\n try:\n result = orjson.loads(output)\n # Update the column name and change to bool\n result[\"keep_row_after_semantic_check\"] = (\n result.pop(\"passes\").lower() == \"yes\"\n )\n input.update(**result)\n return input\n except orjson.JSONDecodeError:\n return self._default_error(input)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker._default_error","title":"_default_error(input) ","text":"Default error message for the task. Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"Default error message for the task.\"\"\"\n input.update({\"thought\": None, \"keep_row_after_semantic_check\": None})\n return input\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from typing import Literal\nfrom pydantic import BaseModel\nimport json\n\nclass Checker(BaseModel):\n thought: str\n passes: Literal[\"yes\", \"no\"]\n\njson.dumps(Checker.model_json_schema(), indent=4)\n Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from typing import Literal\n from pydantic import BaseModel\n import json\n\n class Checker(BaseModel):\n thought: str\n passes: Literal[\"yes\", \"no\"]\n\n json.dumps(Checker.model_json_schema(), indent=4)\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"thought\": {\"title\": \"Thought\", \"type\": \"string\"},\n \"passes\": {\"enum\": [\"yes\", \"no\"], \"title\": \"Passes\", \"type\": \"string\"},\n },\n \"required\": [\"thought\", \"passes\"],\n \"title\": \"Checker\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller","title":"ArgillaLabeller ","text":" Bases: Task Annotate Argilla records based on input fields, example records and question settings. This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM. It uses a system prompt that guides the LLM to understand the input fields, the question type, and the question settings. The task then formats the input data and generates a response based on the question. 
The response is validated against the question's value model, and the final suggestion is prepared for annotation. Attributes: Name Type Description _template Union[Template, None] a Jinja2 template used to format the input for the LLM. Input columns - record (
argilla.Record ): The record to be annotated. - fields (
Optional[List[Dict[str, Any]]] ): The list of field settings for the input fields. - question (
Optional[Dict[str, Any]] ): The question settings for the question to be answered. - example_records (
Optional[List[Dict[str, Any]]] ): The few shot example records with responses to be used to answer the question. - guidelines (
Optional[str] ): The guidelines for the annotation task. Output columns - suggestion (
Dict[str, Any] ): The final suggestion for annotation. Categories - text-classification
- scorer
- text-generation
References Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets Examples: Annotate a record with the same dataset and question: import argilla as rg\nfrom argilla import Suggestion\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\npending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\ncompleted_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\npending_records = list(\n dataset.records(\n query=rg.Query(filter=pending_records_filter),\n limit=5,\n )\n)\nexample_records = list(\n dataset.records(\n query=rg.Query(filter=completed_records_filter),\n limit=5,\n )\n)\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n fields=[field],\n question=question,\n example_records=example_records,\n guidelines=dataset.guidelines\n)\nlabeller.load()\n\n# Process the pending records\nresult = next(\n labeller.process(\n [\n {\n \"record\": record\n } for record in pending_records\n ]\n )\n)\n\n# Add the suggestions to the records\nfor record, suggestion in zip(pending_records, result):\n record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated records\ndataset.records.log(pending_records)\n Annotate a record with alternating datasets and questions: import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\nquestion2 = dataset.settings.questions[\"label2\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\nlabeller.load()\n\n# Process the record\nrecord = next(dataset.records())\nresult = next(\n labeller.process(\n [\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question,\n },\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question2,\n }\n ]\n )\n)\n\n# Add the suggestions to the record\nfor suggestion in result:\n record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated record\ndataset.records.log([record])\n Overwrite default prompts and instructions: import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Overwrite default prompts and instructions\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n question_to_label_instruction={\n \"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n \"multi_label_selection\": \"Select none, one or multiple labels from the list of provided labels.\",\n \"text\": \"Provide a text response to the question.\",\n \"rating\": \"Provide a rating for the question.\",\n },\n)\nlabeller.load()\n Source code in src/distilabel/steps/tasks/argilla_labeller.py class 
ArgillaLabeller(Task):\n \"\"\"\n Annotate Argilla records based on input fields, example records and question settings.\n\n This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM.\n It uses a system prompt that guides the LLM to understand the input fields, the question type,\n and the question settings. The task then formats the input data and generates a response based on the question.\n The response is validated against the question's value model, and the final suggestion is prepared for annotation.\n\n Attributes:\n _template: a Jinja2 template used to format the input for the LLM.\n\n Input columns:\n - record (`argilla.Record`): The record to be annotated.\n - fields (`Optional[List[Dict[str, Any]]]`): The list of field settings for the input fields.\n - question (`Optional[Dict[str, Any]]`): The question settings for the question to be answered.\n - example_records (`Optional[List[Dict[str, Any]]]`): The few shot example records with responses to be used to answer the question.\n - guidelines (`Optional[str]`): The guidelines for the annotation task.\n\n Output columns:\n - suggestion (`Dict[str, Any]`): The final suggestion for annotation.\n\n Categories:\n - text-classification\n - scorer\n - text-generation\n\n References:\n - [`Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets`](https://github.com/argilla-io/argilla/)\n\n Examples:\n Annotate a record with the same dataset and question:\n\n ```python\n import argilla as rg\n from argilla import Suggestion\n from distilabel.steps.tasks import ArgillaLabeller\n from distilabel.models import InferenceEndpointsLLM\n\n # Get information from Argilla dataset definition\n dataset = rg.Dataset(\"my_dataset\")\n pending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\n completed_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\n pending_records = list(\n dataset.records(\n query=rg.Query(filter=pending_records_filter),\n limit=5,\n )\n )\n example_records = list(\n dataset.records(\n query=rg.Query(filter=completed_records_filter),\n limit=5,\n )\n )\n field = dataset.settings.fields[\"text\"]\n question = dataset.settings.questions[\"label\"]\n\n # Initialize the labeller with the model and fields\n labeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n fields=[field],\n question=question,\n example_records=example_records,\n guidelines=dataset.guidelines\n )\n labeller.load()\n\n # Process the pending records\n result = next(\n labeller.process(\n [\n {\n \"record\": record\n } for record in pending_records\n ]\n )\n )\n\n # Add the suggestions to the records\n for record, suggestion in zip(pending_records, result):\n record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n # Log the updated records\n dataset.records.log(pending_records)\n ```\n\n Annotate a record with alternating datasets and questions:\n\n ```python\n import argilla as rg\n from distilabel.steps.tasks import ArgillaLabeller\n from distilabel.models import InferenceEndpointsLLM\n\n # Get information from Argilla dataset definition\n dataset = rg.Dataset(\"my_dataset\")\n field = dataset.settings.fields[\"text\"]\n question = dataset.settings.questions[\"label\"]\n question2 = dataset.settings.questions[\"label2\"]\n\n # Initialize the labeller with the model and fields\n labeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n 
model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n )\n labeller.load()\n\n # Process the record\n record = next(dataset.records())\n result = next(\n labeller.process(\n [\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question,\n },\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question2,\n }\n ]\n )\n )\n\n # Add the suggestions to the record\n for suggestion in result:\n record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n # Log the updated record\n dataset.records.log([record])\n ```\n\n Overwrite default prompts and instructions:\n\n ```python\n import argilla as rg\n from distilabel.steps.tasks import ArgillaLabeller\n from distilabel.models import InferenceEndpointsLLM\n\n # Overwrite default prompts and instructions\n labeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n question_to_label_instruction={\n \"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n \"multi_label_selection\": \"Select none, one or multiple labels from the list of provided labels.\",\n \"text\": \"Provide a text response to the question.\",\n \"rating\": \"Provide a rating for the question.\",\n },\n )\n labeller.load()\n ```\n \"\"\"\n\n system_prompt: str = (\n \"You are an expert annotator and labelling assistant that understands complex domains and natural language processing. \"\n \"You are given input fields and a question. \"\n \"You should create a valid JSON object as an response to the question based on the input fields. \"\n )\n question_to_label_instruction: Dict[str, str] = {\n \"label_selection\": \"Select the appropriate label for the fields from the list of optional labels.\",\n \"multi_label_selection\": \"Select none, one or multiple labels for the fields from the list of optional labels.\",\n \"text\": \"Provide a response to the question based on the fields.\",\n \"rating\": \"Provide a rating for the question based on the fields.\",\n }\n example_records: Optional[\n RuntimeParameter[Union[List[Union[Dict[str, Any], BaseModel]], None]]\n ] = Field(\n default=None,\n description=\"The few shot serialized example records or `BaseModel`s with responses to be used to answer the question.\",\n )\n fields: Optional[\n RuntimeParameter[Union[List[Union[BaseModel, Dict[str, Any]]], None]]\n ] = Field(\n default=None,\n description=\"The field serialized field settings or `BaseModel` for the fields to be used to answer the question.\",\n )\n question: Optional[\n RuntimeParameter[\n Union[\n Dict[str, Any],\n BaseModel,\n None,\n ]\n ]\n ] = Field(\n default=None,\n description=\"The question serialized question settings or `BaseModel` for the question to be answered.\",\n )\n guidelines: Optional[RuntimeParameter[str]] = Field(\n default=None,\n description=\"The guidelines for the annotation task.\",\n )\n\n _template: Union[Template, None] = PrivateAttr(...)\n _client: Optional[Any] = PrivateAttr(None)\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"argillalabeller.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> Dict[str, bool]:\n return {\n \"record\": True,\n \"fields\": False,\n \"question\": False,\n 
\"example_records\": False,\n \"guidelines\": False,\n }\n\n def _format_record(\n self, record: Dict[str, Any], fields: List[Dict[str, Any]]\n ) -> str:\n \"\"\"Format the record fields into a string.\n\n Args:\n record (Dict[str, Any]): The record to format.\n fields (List[Dict[str, Any]]): The fields to format.\n\n Returns:\n str: The formatted record fields.\n \"\"\"\n output = []\n for field in fields:\n output.append(record.get(\"fields\", {}).get(field.get(\"name\", \"\")))\n return \"fields: \" + \"\\n\".join(output)\n\n def _get_label_instruction(self, question: Dict[str, Any]) -> str:\n \"\"\"Get the label instruction for the question.\n\n Args:\n question (Dict[str, Any]): The question to get the label instruction for.\n\n Returns:\n str: The label instruction for the question.\n \"\"\"\n question_type = question[\"settings\"][\"type\"]\n return self.question_to_label_instruction[question_type]\n\n def _format_question(self, question: Dict[str, Any]) -> str:\n \"\"\"Format the question settings into a string.\n\n Args:\n question (Dict[str, Any]): The question to format.\n\n Returns:\n str: The formatted question.\n \"\"\"\n output = []\n output.append(f\"question: {self._get_label_instruction(question)}\")\n if \"options\" in question.get(\"settings\", {}):\n output.append(\n f\"optional labels: {[option['value'] for option in question.get('settings', {}).get('options', [])]}\"\n )\n return \"\\n\".join(output)\n\n def _format_example_records(\n self,\n records: List[Dict[str, Any]],\n fields: List[Dict[str, Any]],\n question: Dict[str, Any],\n ) -> str:\n \"\"\"Format the example records into a string.\n\n Args:\n records (List[Dict[str, Any]]): The records to format.\n fields (List[Dict[str, Any]]): The fields to format.\n question (Dict[str, Any]): The question to format.\n\n Returns:\n str: The formatted example records.\n \"\"\"\n base = []\n for record in records:\n responses = record.get(\"responses\", {})\n if responses.get(question[\"name\"]):\n base.append(self._format_record(record, fields))\n value = responses[question[\"name\"]][0][\"value\"]\n formatted_value = self._assign_value_to_question_value_model(\n value, question\n )\n base.append(f\"response: {formatted_value}\")\n base.append(\"\")\n else:\n warnings.warn(\n f\"Record {record} has no response for question {question['name']}. 
Skipping example record.\",\n stacklevel=2,\n )\n return \"\\n\".join(base)\n\n def format_input(\n self,\n input: Dict[\n str,\n Union[\n Dict[str, Any],\n \"Record\",\n \"TextField\",\n \"MultiLabelQuestion\",\n \"LabelQuestion\",\n \"RatingQuestion\",\n \"TextQuestion\",\n ],\n ],\n ) -> \"ChatType\":\n \"\"\"Format the input into a chat message.\n\n Args:\n input: The input to format.\n\n Returns:\n The formatted chat message.\n\n Raises:\n ValueError: If question or fields are not provided.\n \"\"\"\n input_keys = list(self.inputs.keys())\n record = input[input_keys[0]]\n fields = input.get(input_keys[1], self.fields)\n question = input.get(input_keys[2], self.question)\n examples = input.get(input_keys[3], self.example_records)\n guidelines = input.get(input_keys[4], self.guidelines)\n\n if question is None:\n raise ValueError(\"Question must be provided.\")\n if fields is None or any(field is None for field in fields):\n raise ValueError(\"Fields must be provided.\")\n\n record = record.to_dict() if not isinstance(record, dict) else record\n question = question.serialize() if not isinstance(question, dict) else question\n fields = [\n field.serialize() if not isinstance(field, dict) else field\n for field in fields\n ]\n examples = (\n [\n example.to_dict() if not isinstance(example, dict) else example\n for example in examples\n ]\n if examples\n else None\n )\n\n formatted_fields = self._format_record(record, fields)\n formatted_question = self._format_question(question)\n formatted_examples = (\n self._format_example_records(examples, fields, question)\n if examples\n else False\n )\n\n prompt = self._template.render(\n fields=formatted_fields,\n question=formatted_question,\n examples=formatted_examples,\n guidelines=guidelines,\n )\n\n messages = []\n if self.system_prompt:\n messages.append({\"role\": \"system\", \"content\": self.system_prompt})\n messages.append({\"role\": \"user\", \"content\": prompt})\n return messages\n\n @property\n def outputs(self) -> List[str]:\n return [\"suggestion\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"Format the output into a dictionary.\n\n Args:\n output (Union[str, None]): The output to format.\n input (Dict[str, Any]): The input to format.\n\n Returns:\n Dict[str, Any]: The formatted output.\n \"\"\"\n from argilla import Suggestion\n\n question: Union[\n Any,\n Dict[str, Any],\n LabelQuestion,\n MultiLabelQuestion,\n RatingQuestion,\n TextQuestion,\n None,\n ] = input.get(list(self.inputs.keys())[2], self.question) or self.question\n question = question.serialize() if not isinstance(question, dict) else question\n model = self._get_pydantic_model_of_structured_output(question)\n validated_output = model(**json.loads(output))\n value = self._get_value_from_question_value_model(validated_output)\n suggestion = Suggestion(\n value=value,\n question_name=question[\"name\"],\n type=\"model\",\n agent=self.llm.model_name,\n ).serialize()\n return {\n self.outputs[0]: {\n k: v\n for k, v in suggestion.items()\n if k in [\"value\", \"question_name\", \"type\", \"agent\"]\n }\n }\n\n def _set_llm_structured_output_for_question(self, question: Dict[str, Any]) -> None:\n runtime_parameters = self.llm._runtime_parameters\n runtime_parameters.update(\n {\n \"structured_output\": {\n \"format\": \"json\",\n \"schema\": self._get_pydantic_model_of_structured_output(question),\n },\n }\n )\n self.llm.set_runtime_parameters(runtime_parameters)\n\n @override\n def process(self, inputs: 
StepInput) -> \"StepOutput\":\n \"\"\"Process the input through the task.\n\n Args:\n inputs (StepInput): The input to process.\n\n Returns:\n StepOutput: The output of the task.\n \"\"\"\n\n question_list = [input.get(\"question\", self.question) for input in inputs]\n fields_list = [input.get(\"fields\", self.fields) for input in inputs]\n # check if any field for the field in fields is None\n for fields in fields_list:\n if any(field is None for field in fields):\n raise ValueError(\n \"Fields must be provided during init or through `process` method.\"\n )\n # check if any question is None\n if any(question is None for question in question_list):\n raise ValueError(\n \"Question must be provided during init or through `process` method.\"\n )\n question_list = [\n question.serialize() if not isinstance(question, dict) else question\n for question in question_list\n ]\n if not all(question == question_list[0] for question in question_list):\n warnings.warn(\n \"Not all questions are the same. Processing each question separately by setting the structured output for each question. This may impact performance.\",\n stacklevel=2,\n )\n for input, question in zip(inputs, question_list):\n self._set_llm_structured_output_for_question(question)\n yield from super().process([input])\n else:\n question = question_list[0]\n self._set_llm_structured_output_for_question(question)\n yield from super().process(inputs)\n\n def _get_value_from_question_value_model(\n self, question_value_model: BaseModel\n ) -> Any:\n \"\"\"Get the value from the question value model.\n\n Args:\n question_value_model (BaseModel): The question value model to get the value from.\n\n Returns:\n Any: The value from the question value model.\n \"\"\"\n for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n if hasattr(question_value_model, attr):\n return getattr(question_value_model, attr)\n raise ValueError(f\"Unsupported question type: {question_value_model}\")\n\n def _assign_value_to_question_value_model(\n self, value: Any, question: Dict[str, Any]\n ) -> BaseModel:\n \"\"\"Assign the value to the question value model.\n\n Args:\n value (Any): The value to assign.\n question (Dict[str, Any]): The question to assign the value to.\n\n Returns:\n BaseModel: The question value model with the assigned value.\n \"\"\"\n question_value_model = self._get_pydantic_model_of_structured_output(question)\n for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n try:\n model_dict = {attr: value}\n question_value_model = question_value_model(**model_dict)\n return question_value_model.model_dump_json()\n except AttributeError:\n pass\n return value\n\n def _get_pydantic_model_of_structured_output(\n self,\n question: Dict[str, Any],\n ) -> BaseModel:\n \"\"\"Get the Pydantic model of the structured output.\n\n Args:\n question (Dict[str, Any]): The question to get the Pydantic model of the structured output for.\n\n Returns:\n BaseModel: The Pydantic model of the structured output.\n \"\"\"\n\n question_type = question[\"settings\"][\"type\"]\n\n if question_type == \"multi_label_selection\":\n\n class QuestionValueModel(BaseModel):\n labels: Optional[List[str]] = Field(default_factory=list)\n\n elif question_type == \"label_selection\":\n\n class QuestionValueModel(BaseModel):\n label: str\n\n elif question_type == \"text\":\n\n class QuestionValueModel(BaseModel):\n text: str\n\n elif question_type == \"rating\":\n\n class QuestionValueModel(BaseModel):\n rating: int\n else:\n raise ValueError(f\"Unsupported question type: 
{question}\")\n\n return QuestionValueModel\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/argilla_labeller.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"argillalabeller.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_record","title":"_format_record(record, fields) ","text":"Format the record fields into a string. Parameters: Name Type Description Default record Dict[str, Any] The record to format. required fields List[Dict[str, Any]] The fields to format. required Returns: Name Type Description str str The formatted record fields. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _format_record(\n self, record: Dict[str, Any], fields: List[Dict[str, Any]]\n) -> str:\n \"\"\"Format the record fields into a string.\n\n Args:\n record (Dict[str, Any]): The record to format.\n fields (List[Dict[str, Any]]): The fields to format.\n\n Returns:\n str: The formatted record fields.\n \"\"\"\n output = []\n for field in fields:\n output.append(record.get(\"fields\", {}).get(field.get(\"name\", \"\")))\n return \"fields: \" + \"\\n\".join(output)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_label_instruction","title":"_get_label_instruction(question) ","text":"Get the label instruction for the question. Parameters: Name Type Description Default question Dict[str, Any] The question to get the label instruction for. required Returns: Name Type Description str str The label instruction for the question. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _get_label_instruction(self, question: Dict[str, Any]) -> str:\n \"\"\"Get the label instruction for the question.\n\n Args:\n question (Dict[str, Any]): The question to get the label instruction for.\n\n Returns:\n str: The label instruction for the question.\n \"\"\"\n question_type = question[\"settings\"][\"type\"]\n return self.question_to_label_instruction[question_type]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_question","title":"_format_question(question) ","text":"Format the question settings into a string. Parameters: Name Type Description Default question Dict[str, Any] The question to format. required Returns: Name Type Description str str The formatted question. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _format_question(self, question: Dict[str, Any]) -> str:\n \"\"\"Format the question settings into a string.\n\n Args:\n question (Dict[str, Any]): The question to format.\n\n Returns:\n str: The formatted question.\n \"\"\"\n output = []\n output.append(f\"question: {self._get_label_instruction(question)}\")\n if \"options\" in question.get(\"settings\", {}):\n output.append(\n f\"optional labels: {[option['value'] for option in question.get('settings', {}).get('options', [])]}\"\n )\n return \"\\n\".join(output)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_example_records","title":"_format_example_records(records, fields, question) ","text":"Format the example records into a string. Parameters: Name Type Description Default records List[Dict[str, Any]] The records to format. 
required fields List[Dict[str, Any]] The fields to format. required question Dict[str, Any] The question to format. required Returns: Name Type Description str str The formatted example records. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _format_example_records(\n self,\n records: List[Dict[str, Any]],\n fields: List[Dict[str, Any]],\n question: Dict[str, Any],\n) -> str:\n \"\"\"Format the example records into a string.\n\n Args:\n records (List[Dict[str, Any]]): The records to format.\n fields (List[Dict[str, Any]]): The fields to format.\n question (Dict[str, Any]): The question to format.\n\n Returns:\n str: The formatted example records.\n \"\"\"\n base = []\n for record in records:\n responses = record.get(\"responses\", {})\n if responses.get(question[\"name\"]):\n base.append(self._format_record(record, fields))\n value = responses[question[\"name\"]][0][\"value\"]\n formatted_value = self._assign_value_to_question_value_model(\n value, question\n )\n base.append(f\"response: {formatted_value}\")\n base.append(\"\")\n else:\n warnings.warn(\n f\"Record {record} has no response for question {question['name']}. Skipping example record.\",\n stacklevel=2,\n )\n return \"\\n\".join(base)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.format_input","title":"format_input(input) ","text":"Format the input into a chat message. Parameters: Name Type Description Default input Dict[str, Union[Dict[str, Any], Record, TextField, MultiLabelQuestion, LabelQuestion, RatingQuestion, TextQuestion]] The input to format. required Returns: Type Description ChatType The formatted chat message. Raises: Type Description ValueError If question or fields are not provided. Source code in src/distilabel/steps/tasks/argilla_labeller.py def format_input(\n self,\n input: Dict[\n str,\n Union[\n Dict[str, Any],\n \"Record\",\n \"TextField\",\n \"MultiLabelQuestion\",\n \"LabelQuestion\",\n \"RatingQuestion\",\n \"TextQuestion\",\n ],\n ],\n) -> \"ChatType\":\n \"\"\"Format the input into a chat message.\n\n Args:\n input: The input to format.\n\n Returns:\n The formatted chat message.\n\n Raises:\n ValueError: If question or fields are not provided.\n \"\"\"\n input_keys = list(self.inputs.keys())\n record = input[input_keys[0]]\n fields = input.get(input_keys[1], self.fields)\n question = input.get(input_keys[2], self.question)\n examples = input.get(input_keys[3], self.example_records)\n guidelines = input.get(input_keys[4], self.guidelines)\n\n if question is None:\n raise ValueError(\"Question must be provided.\")\n if fields is None or any(field is None for field in fields):\n raise ValueError(\"Fields must be provided.\")\n\n record = record.to_dict() if not isinstance(record, dict) else record\n question = question.serialize() if not isinstance(question, dict) else question\n fields = [\n field.serialize() if not isinstance(field, dict) else field\n for field in fields\n ]\n examples = (\n [\n example.to_dict() if not isinstance(example, dict) else example\n for example in examples\n ]\n if examples\n else None\n )\n\n formatted_fields = self._format_record(record, fields)\n formatted_question = self._format_question(question)\n formatted_examples = (\n self._format_example_records(examples, fields, question)\n if examples\n else False\n )\n\n prompt = self._template.render(\n fields=formatted_fields,\n question=formatted_question,\n examples=formatted_examples,\n guidelines=guidelines,\n )\n\n messages = []\n if self.system_prompt:\n 
messages.append({\"role\": \"system\", \"content\": self.system_prompt})\n messages.append({\"role\": \"user\", \"content\": prompt})\n return messages\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.format_output","title":"format_output(output, input) ","text":"Format the output into a dictionary. Parameters: Name Type Description Default output Union[str, None] The output to format. required input Dict[str, Any] The input to format. required Returns: Type Description Dict[str, Any] Dict[str, Any]: The formatted output. Source code in src/distilabel/steps/tasks/argilla_labeller.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"Format the output into a dictionary.\n\n Args:\n output (Union[str, None]): The output to format.\n input (Dict[str, Any]): The input to format.\n\n Returns:\n Dict[str, Any]: The formatted output.\n \"\"\"\n from argilla import Suggestion\n\n question: Union[\n Any,\n Dict[str, Any],\n LabelQuestion,\n MultiLabelQuestion,\n RatingQuestion,\n TextQuestion,\n None,\n ] = input.get(list(self.inputs.keys())[2], self.question) or self.question\n question = question.serialize() if not isinstance(question, dict) else question\n model = self._get_pydantic_model_of_structured_output(question)\n validated_output = model(**json.loads(output))\n value = self._get_value_from_question_value_model(validated_output)\n suggestion = Suggestion(\n value=value,\n question_name=question[\"name\"],\n type=\"model\",\n agent=self.llm.model_name,\n ).serialize()\n return {\n self.outputs[0]: {\n k: v\n for k, v in suggestion.items()\n if k in [\"value\", \"question_name\", \"type\", \"agent\"]\n }\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.process","title":"process(inputs) ","text":"Process the input through the task. Parameters: Name Type Description Default inputs StepInput The input to process. required Returns: Name Type Description StepOutput StepOutput The output of the task. Source code in src/distilabel/steps/tasks/argilla_labeller.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Process the input through the task.\n\n Args:\n inputs (StepInput): The input to process.\n\n Returns:\n StepOutput: The output of the task.\n \"\"\"\n\n question_list = [input.get(\"question\", self.question) for input in inputs]\n fields_list = [input.get(\"fields\", self.fields) for input in inputs]\n # check if any field for the field in fields is None\n for fields in fields_list:\n if any(field is None for field in fields):\n raise ValueError(\n \"Fields must be provided during init or through `process` method.\"\n )\n # check if any question is None\n if any(question is None for question in question_list):\n raise ValueError(\n \"Question must be provided during init or through `process` method.\"\n )\n question_list = [\n question.serialize() if not isinstance(question, dict) else question\n for question in question_list\n ]\n if not all(question == question_list[0] for question in question_list):\n warnings.warn(\n \"Not all questions are the same. Processing each question separately by setting the structured output for each question. 
This may impact performance.\",\n stacklevel=2,\n )\n for input, question in zip(inputs, question_list):\n self._set_llm_structured_output_for_question(question)\n yield from super().process([input])\n else:\n question = question_list[0]\n self._set_llm_structured_output_for_question(question)\n yield from super().process(inputs)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_value_from_question_value_model","title":"_get_value_from_question_value_model(question_value_model) ","text":"Get the value from the question value model. Parameters: Name Type Description Default question_value_model BaseModel The question value model to get the value from. required Returns: Name Type Description Any Any The value from the question value model. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _get_value_from_question_value_model(\n self, question_value_model: BaseModel\n) -> Any:\n \"\"\"Get the value from the question value model.\n\n Args:\n question_value_model (BaseModel): The question value model to get the value from.\n\n Returns:\n Any: The value from the question value model.\n \"\"\"\n for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n if hasattr(question_value_model, attr):\n return getattr(question_value_model, attr)\n raise ValueError(f\"Unsupported question type: {question_value_model}\")\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._assign_value_to_question_value_model","title":"_assign_value_to_question_value_model(value, question) ","text":"Assign the value to the question value model. Parameters: Name Type Description Default value Any The value to assign. required question Dict[str, Any] The question to assign the value to. required Returns: Name Type Description BaseModel BaseModel The question value model with the assigned value. Source code in src/distilabel/steps/tasks/argilla_labeller.py def _assign_value_to_question_value_model(\n self, value: Any, question: Dict[str, Any]\n) -> BaseModel:\n \"\"\"Assign the value to the question value model.\n\n Args:\n value (Any): The value to assign.\n question (Dict[str, Any]): The question to assign the value to.\n\n Returns:\n BaseModel: The question value model with the assigned value.\n \"\"\"\n question_value_model = self._get_pydantic_model_of_structured_output(question)\n for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n try:\n model_dict = {attr: value}\n question_value_model = question_value_model(**model_dict)\n return question_value_model.model_dump_json()\n except AttributeError:\n pass\n return value\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_pydantic_model_of_structured_output","title":"_get_pydantic_model_of_structured_output(question) ","text":"Get the Pydantic model of the structured output. Parameters: Name Type Description Default question Dict[str, Any] The question to get the Pydantic model of the structured output for. required Returns: Name Type Description BaseModel BaseModel The Pydantic model of the structured output. 
Source code in src/distilabel/steps/tasks/argilla_labeller.py def _get_pydantic_model_of_structured_output(\n self,\n question: Dict[str, Any],\n) -> BaseModel:\n \"\"\"Get the Pydantic model of the structured output.\n\n Args:\n question (Dict[str, Any]): The question to get the Pydantic model of the structured output for.\n\n Returns:\n BaseModel: The Pydantic model of the structured output.\n \"\"\"\n\n question_type = question[\"settings\"][\"type\"]\n\n if question_type == \"multi_label_selection\":\n\n class QuestionValueModel(BaseModel):\n labels: Optional[List[str]] = Field(default_factory=list)\n\n elif question_type == \"label_selection\":\n\n class QuestionValueModel(BaseModel):\n label: str\n\n elif question_type == \"text\":\n\n class QuestionValueModel(BaseModel):\n text: str\n\n elif question_type == \"rating\":\n\n class QuestionValueModel(BaseModel):\n rating: int\n else:\n raise ValueError(f\"Unsupported question type: {question}\")\n\n return QuestionValueModel\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR","title":"CLAIR ","text":" Bases: Task Contrastive Learning from AI Revisions (CLAIR). CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting preference A preferred A\u2019 is much more contrastive and precise. Input columns - task (
str ): The task or instruction. - student_solution (
str ): An answer to the task that is to be revised. Output columns - revision (
str ): The revised text. - rational (
str ): The rational for the provided revision. - model_name (
str ): The name of the model used to generate the revision and rational. Categories - preference
- text-generation
References Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment APO and CLAIR - GitHub Repository Examples: Create contrastive preference pairs: from distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096,\n },\n)\nclair_task = CLAIR(llm=llm)\n\nclair_task.load()\n\nresult = next(\n clair_task.process(\n [\n {\n \"task\": \"How many gaps are there between the earth and the moon?\",\n \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n }\n ]\n )\n)\n# result\n# [{'task': 'How many gaps are there between the earth and the moon?',\n# 'student_solution': 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n# 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. 
This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n# 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n# 'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n# {'role': 'user',\n# 'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. 
The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Citations: ```\n@misc{doosterlinck2024anchoredpreferenceoptimizationcontrastive,\n title={Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment},\n author={Karel D'Oosterlinck and Winnie Xu and Chris Develder and Thomas Demeester and Amanpreet Singh and Christopher Potts and Douwe Kiela and Shikib Mehri},\n year={2024},\n eprint={2408.06266},\n archivePrefix={arXiv},\n primaryClass={cs.LG},\n url={https://arxiv.org/abs/2408.06266},\n}\n```\n Source code in src/distilabel/steps/tasks/clair.py class CLAIR(Task):\n r\"\"\"Contrastive Learning from AI Revisions (CLAIR).\n\n CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting\n preference A `preferred` A\u2019 is much more contrastive and precise.\n\n Input columns:\n - task (`str`): The task or instruction.\n - student_solution (`str`): An answer to the task that is to be revised.\n\n Output columns:\n - revision (`str`): The revised text.\n - rational (`str`): The rational for the provided revision.\n - model_name (`str`): The name of the model used to generate the revision and rational.\n\n Categories:\n - preference\n - text-generation\n\n References:\n - [`Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment`](https://arxiv.org/abs/2408.06266v1)\n - [`APO and CLAIR - GitHub Repository`](https://github.com/ContextualAI/CLAIR_and_APO)\n\n Examples:\n Create contrastive preference pairs:\n\n ```python\n from distilabel.steps.tasks import CLAIR\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096,\n },\n )\n clair_task = CLAIR(llm=llm)\n\n clair_task.load()\n\n result = next(\n clair_task.process(\n [\n {\n \"task\": \"How many gaps are there between the earth and the moon?\",\n \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n }\n ]\n )\n )\n # result\n # [{'task': 'How many gaps are there between the earth and the moon?',\n # 'student_solution': 'There are no gaps between the Earth and the Moon. 
The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n # 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n # 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n # 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. 
This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n # 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n # 'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n # {'role': 'user',\n # 'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. 
Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Citations:\n\n ```\n @misc{doosterlinck2024anchoredpreferenceoptimizationcontrastive,\n title={Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment},\n author={Karel D'Oosterlinck and Winnie Xu and Chris Develder and Thomas Demeester and Amanpreet Singh and Christopher Potts and Douwe Kiela and Shikib Mehri},\n year={2024},\n eprint={2408.06266},\n archivePrefix={arXiv},\n primaryClass={cs.LG},\n url={https://arxiv.org/abs/2408.06266},\n }\n ```\n \"\"\"\n\n system_prompt: str = SYSTEM_PROMPT\n _template: Union[Template, None] = PrivateAttr(...)\n\n def load(self) -> None:\n super().load()\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"clair.jinja2\"\n )\n with open(_path, \"r\") as f:\n self._template = Template(f.read())\n\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"task\", \"student_solution\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"revision\", \"rational\", \"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n task=input[\"task\"], student_solution=input[\"student_solution\"]\n ),\n },\n ]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction-response pair.\n \"\"\"\n if output is None:\n return self._default_error()\n\n return self._format_output(output)\n\n def _format_output(self, output: Union[str, None]) -> Dict[str, Any]:\n if \"**Corrected Student Solution:**\" in output:\n splits = output.split(\"**Corrected Student Solution:**\")\n elif \"{corrected_student_solution}:\" in output:\n splits = output.split(\"{corrected_student_solution}:\")\n elif \"{corrected_student_solution}\" in output:\n splits = output.split(\"{corrected_student_solution}\")\n elif \"**Worsened Student Solution:**\" in output:\n splits = output.split(\"**Worsened Student Solution:**\")\n elif \"{worsened_student_solution}:\" in output:\n splits = output.split(\"{worsened_student_solution}:\")\n elif \"{worsened_student_solution}\" in output:\n splits = output.split(\"{worsened_student_solution}\")\n else:\n splits = None\n\n # Safety check when the output doesn't follow the expected format\n if not splits:\n return self._default_error()\n\n if len(splits) >= 2:\n revision = splits[1]\n revision = revision.strip(\"\\n\\n\").strip() # noqa: B005\n\n rational = splits[0]\n if \"{teacher_reasoning}\" in rational:\n rational = rational.split(\"{teacher_reasoning}\")[1].strip(\":\").strip()\n rational = rational.strip(\"\\n\\n\").strip() # noqa: B005\n else:\n return self._default_error()\n return {\"revision\": revision, \"rational\": rational}\n\n def _default_error(self) -> Dict[str, None]:\n return {\"revision\": None, \"rational\": None}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/clair.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\"role\": \"system\", \"content\": self.system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n task=input[\"task\"], student_solution=input[\"student_solution\"]\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR.format_output","title":"format_output(output, input) ","text":"The output is formatted as a list with the score of each instruction-response pair. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the key scores containing the scores for each instruction-response pair. Source code in src/distilabel/steps/tasks/clair.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction-response pair.\n \"\"\"\n if output is None:\n return self._default_error()\n\n return self._format_output(output)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer","title":"ComplexityScorer ","text":" Bases: Task Score instructions based on their complexity using an LLM . ComplexityScorer is a pre-defined task used to rank a list of instructions based in their complexity. It's an implementation of the complexity score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. Attributes: Name Type Description _template Union[Template, None] a Jinja2 template used to format the input for the LLM. Input columns - instructions (
List[str] ): The list of instructions to be scored. Output columns - scores (
List[float] ): The score for each instruction. - model_name (
str ): The model name used to generate the scores. Categories - scorer
- complexity
- instruction
References What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning Examples: Evaluate the complexity of your instructions: from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n Generate structured output with default schema: from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=use_default_structured_output\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\n \"scores\": [\\n 1, \\n 2\\n ]\\n}'}}]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n Source code in src/distilabel/steps/tasks/complexity_scorer.py class ComplexityScorer(Task):\n \"\"\"Score instructions based on their complexity using an `LLM`.\n\n `ComplexityScorer` is a pre-defined task used to rank a list of instructions based in\n their complexity. It's an implementation of the complexity score task from the paper\n 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection\n in Instruction Tuning'.\n\n Attributes:\n _template: a Jinja2 template used to format the input for the LLM.\n\n Input columns:\n - instructions (`List[str]`): The list of instructions to be scored.\n\n Output columns:\n - scores (`List[float]`): The score for each instruction.\n - model_name (`str`): The model name used to generate the scores.\n\n Categories:\n - scorer\n - complexity\n - instruction\n\n References:\n - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n Examples:\n Evaluate the complexity of your instructions:\n\n ```python\n from distilabel.steps.tasks import ComplexityScorer\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n scorer = ComplexityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n )\n\n scorer.load()\n\n result = next(\n scorer.process(\n [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n )\n )\n # result\n # [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n ```\n\n Generate structured output with default schema:\n\n ```python\n from distilabel.steps.tasks import ComplexityScorer\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n scorer = ComplexityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=use_default_structured_output\n )\n\n scorer.load()\n\n result = next(\n scorer.process(\n [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n )\n )\n # result\n # [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\\\n \"scores\": [\\\\n 1, \\\\n 2\\\\n ]\\\\n}'}}]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n \"\"\"\n\n _template: Union[Template, None] = PrivateAttr(...)\n _can_be_used_with_offline_batch_generation = True\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"complexity-scorer.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the task are the `instructions`.\"\"\"\n return [\"instructions\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render(instructions=input[\"instructions\"]), # type: ignore\n }\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are: a list of `scores` containing the complexity score for each\n instruction in `instructions`, and the `model_name`.\"\"\"\n return [\"scores\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction.\n \"\"\"\n if output is None:\n return {\"scores\": [None] * len(input[\"instructions\"])}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n scores = []\n score_lines = output.split(\"\\n\")\n for i, line in enumerate(score_lines):\n match = _PARSE_SCORE_LINE_REGEX.match(line)\n score = float(match.group(1)) if match else None\n scores.append(score)\n if i == len(input[\"instructions\"]) - 1:\n break\n return {\"scores\": scores}\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaComplexityScorer(BaseModel):\n scores: List[int]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"scores\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Scores\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"scores\"],\n \"title\": \"SchemaComplexityScorer\",\n \"type\": \"object\",\n }\n\n def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n ) -> Dict[str, str]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n return {\"scores\": [None] * len(input[\"instructions\"])}\n\n @override\n def _sample_input(self) -> \"ChatType\":\n \"\"\"Returns a sample input to be used in the `print` method.\n Tasks that don't adhere to a format input that returns a map of the type\n str -> str should override this method to return a sample input.\n \"\"\"\n return self.format_input(\n {\n \"instructions\": [\n f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n ],\n }\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.inputs","title":"inputs: List[str] property ","text":"The inputs for the task are the instructions . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.outputs","title":"outputs: List[str] property ","text":"The output for the task are: a list of scores containing the complexity score for each instruction in instructions , and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/complexity_scorer.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"complexity-scorer.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. 
Source code in src/distilabel/steps/tasks/complexity_scorer.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render(instructions=input[\"instructions\"]), # type: ignore\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.format_output","title":"format_output(output, input) ","text":"The output is formatted as a list with the score of each instruction. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the key scores containing the scores for each instruction. Source code in src/distilabel/steps/tasks/complexity_scorer.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction.\n \"\"\"\n if output is None:\n return {\"scores\": [None] * len(input[\"instructions\"])}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n scores = []\n score_lines = output.split(\"\\n\")\n for i, line in enumerate(score_lines):\n match = _PARSE_SCORE_LINE_REGEX.match(line)\n score = float(match.group(1)) if match else None\n scores.append(score)\n if i == len(input[\"instructions\"]) - 1:\n break\n return {\"scores\": scores}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaComplexityScorer(BaseModel):\n scores: List[int]\n Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/complexity_scorer.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaComplexityScorer(BaseModel):\n scores: List[int]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"scores\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Scores\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"scores\"],\n \"title\": \"SchemaComplexityScorer\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer._format_structured_output","title":"_format_structured_output(output, input) ","text":"Parses the structured response, which should correspond to a dictionary with either positive , or positive and negative keys. 
Parameters: Name Type Description Default output str The output from the LLM . required Returns: Type Description Dict[str, str] Formatted output. Source code in src/distilabel/steps/tasks/complexity_scorer.py def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n) -> Dict[str, str]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n return {\"scores\": [None] * len(input[\"instructions\"])}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer._sample_input","title":"_sample_input() ","text":"Returns a sample input to be used in the print method. Tasks that don't adhere to a format input that returns a map of the type str -> str should override this method to return a sample input. Source code in src/distilabel/steps/tasks/complexity_scorer.py @override\ndef _sample_input(self) -> \"ChatType\":\n \"\"\"Returns a sample input to be used in the `print` method.\n Tasks that don't adhere to a format input that returns a map of the type\n str -> str should override this method to return a sample input.\n \"\"\"\n return self.format_input(\n {\n \"instructions\": [\n f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n ],\n }\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct","title":"EvolInstruct ","text":" Bases: Task Evolve instructions using an LLM . WizardLM: Empowering Large Language Models to Follow Complex Instructions Attributes: Name Type Description num_evolutions int The number of evolutions to be performed. store_evolutions bool Whether to store all the evolutions or just the last one. Defaults to False . generate_answers bool Whether to generate answers for the evolved instructions. Defaults to False . include_original_instruction bool Whether to include the original instruction in the evolved_instructions output column. Defaults to False . mutation_templates Dict[str, str] The mutation templates to be used for evolving the instructions. Defaults to the ones provided in the utils.py file. seed RuntimeParameter[int] The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . Runtime parameters seed : The seed to be set for numpy in order to randomly pick a mutation method. Input columns - instruction (
str ): The instruction to evolve. Output columns - evolved_instruction (
str ): The evolved instruction if store_evolutions=False . - evolved_instructions (
List[str] ): The evolved instructions if store_evolutions=True . - model_name (
str ): The name of the LLM used to evolve the instructions. - answer (
str ): The answer to the evolved instruction if generate_answers=True and store_evolutions=False . - answers (
List[str] ): The answers to the evolved instructions if generate_answers=True and store_evolutions=True . Categories References - WizardLM: Empowering Large Language Models to Follow Complex Instructions
- GitHub: h2oai/h2o-wizardlm
Examples: Evolve an instruction using an LLM: from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n Keep the iterations of the evolutions: from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n store_evolutions=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'evolved_instructions': ['initial evolution', 'final evolution'],\n# 'model_name': 'model_name'\n# }\n# ]\n Generate answers for the instructions in a single step: from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n generate_answers=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'evolved_instruction': 'evolved instruction',\n# 'answer': 'answer to the instruction',\n# 'model_name': 'model_name'\n# }\n# ]\n Citations @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n}\n Source code in src/distilabel/steps/tasks/evol_instruct/base.py class EvolInstruct(Task):\n \"\"\"Evolve instructions using an `LLM`.\n\n WizardLM: Empowering Large Language Models to Follow Complex Instructions\n\n Attributes:\n num_evolutions: The number of evolutions to be performed.\n store_evolutions: Whether to store all the evolutions or just the last one. Defaults\n to `False`.\n generate_answers: Whether to generate answers for the evolved instructions. Defaults\n to `False`.\n include_original_instruction: Whether to include the original instruction in the\n `evolved_instructions` output column. 
Defaults to `False`.\n mutation_templates: The mutation templates to be used for evolving the instructions.\n Defaults to the ones provided in the `utils.py` file.\n seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n Defaults to `42`.\n\n Runtime parameters:\n - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n Input columns:\n - instruction (`str`): The instruction to evolve.\n\n Output columns:\n - evolved_instruction (`str`): The evolved instruction if `store_evolutions=False`.\n - evolved_instructions (`List[str]`): The evolved instructions if `store_evolutions=True`.\n - model_name (`str`): The name of the LLM used to evolve the instructions.\n - answer (`str`): The answer to the evolved instruction if `generate_answers=True`\n and `store_evolutions=False`.\n - answers (`List[str]`): The answers to the evolved instructions if `generate_answers=True`\n and `store_evolutions=True`.\n\n Categories:\n - evol\n - instruction\n\n References:\n - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n - [GitHub: h2oai/h2o-wizardlm](https://github.com/h2oai/h2o-wizardlm)\n\n Examples:\n Evolve an instruction using an LLM:\n\n ```python\n from distilabel.steps.tasks import EvolInstruct\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n )\n\n evol_instruct.load()\n\n result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n # result\n # [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n ```\n\n Keep the iterations of the evolutions:\n\n ```python\n from distilabel.steps.tasks import EvolInstruct\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n store_evolutions=True,\n )\n\n evol_instruct.load()\n\n result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n # result\n # [\n # {\n # 'instruction': 'common instruction',\n # 'evolved_instructions': ['initial evolution', 'final evolution'],\n # 'model_name': 'model_name'\n # }\n # ]\n ```\n\n Generate answers for the instructions in a single step:\n\n ```python\n from distilabel.steps.tasks import EvolInstruct\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n generate_answers=True,\n )\n\n evol_instruct.load()\n\n result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n # result\n # [\n # {\n # 'instruction': 'common instruction',\n # 'evolved_instruction': 'evolved instruction',\n # 'answer': 'answer to the instruction',\n # 'model_name': 'model_name'\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n 
archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n }\n ```\n \"\"\"\n\n num_evolutions: int\n store_evolutions: bool = False\n generate_answers: bool = False\n include_original_instruction: bool = False\n mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n\n seed: RuntimeParameter[int] = Field(\n default=42,\n description=\"As `numpy` is being used in order to randomly pick a mutation method, then is nice to seed a random seed.\",\n )\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`.\"\"\"\n return [\"instruction\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [{\"role\": \"user\", \"content\": input}]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are the `evolved_instruction/s`, the `answer` if `generate_answers=True`\n and the `model_name`.\"\"\"\n # TODO: having to define a `model_name` column every time as the `Task.outputs` is not ideal,\n # this could be handled always and the value could be included within the DAG validation when\n # a `Task` is used, since all the `Task` subclasses will have an `llm` with a `model_name` attr.\n _outputs = [\n (\n \"evolved_instruction\"\n if not self.store_evolutions\n else \"evolved_instructions\"\n ),\n \"model_name\",\n ]\n if self.generate_answers:\n _outputs.append(\"answer\" if not self.store_evolutions else \"answers\")\n return _outputs\n\n @override\n def format_output( # type: ignore\n self, instructions: Union[str, List[str]], answers: Optional[List[str]] = None\n ) -> Dict[str, Any]: # type: ignore\n \"\"\"The output for the task is a dict with: `evolved_instruction` or `evolved_instructions`,\n depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n `answer` if `generate_answers=True`; and, finally, the `model_name`.\n\n Args:\n instructions: The instructions to be included within the output.\n answers: The answers to be included within the output if `generate_answers=True`.\n\n Returns:\n If `store_evolutions=False` and `generate_answers=True` return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...};\n if `store_evolutions=True` and `generate_answers=True` return {\"evolved_instructions\": ..., \"model_name\": ..., \"answer\": ...};\n if `store_evolutions=False` and `generate_answers=False` return {\"evolved_instruction\": ..., \"model_name\": ...};\n if `store_evolutions=True` and `generate_answers=False` return {\"evolved_instructions\": ..., \"model_name\": ...}.\n \"\"\"\n _output = {}\n if not self.store_evolutions:\n _output[\"evolved_instruction\"] = instructions[-1]\n else:\n _output[\"evolved_instructions\"] = instructions\n\n if self.generate_answers and answers:\n if not self.store_evolutions:\n _output[\"answer\"] = answers[-1]\n else:\n _output[\"answers\"] = answers\n\n _output[\"model_name\"] = self.llm.model_name\n return _output\n\n @property\n def mutation_templates_names(self) -> List[str]:\n \"\"\"Returns the names i.e. 
keys of the provided `mutation_templates`.\"\"\"\n return list(self.mutation_templates.keys())\n\n def _apply_random_mutation(self, instruction: str) -> str:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n instruction: The instruction to be included within the mutation prompt.\n\n Returns:\n A random mutation prompt with the provided instruction.\n \"\"\"\n mutation = np.random.choice(self.mutation_templates_names)\n return self.mutation_templates[mutation].replace(\"<PROMPT>\", instruction) # type: ignore\n\n def _evolve_instructions(self, inputs: \"StepInput\") -> List[List[str]]:\n \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list where each item is a list with either the last evolved instruction if\n `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n \"\"\"\n\n instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n statistics: \"LLMStatistics\" = defaultdict(list)\n\n for iter_no in range(self.num_evolutions):\n formatted_prompts = []\n for instruction in instructions:\n formatted_prompts.append(self._apply_random_mutation(instruction[-1]))\n\n formatted_prompts = [\n self.format_input(prompt) for prompt in formatted_prompts\n ]\n responses = self.llm.generate(\n formatted_prompts,\n **self.llm.generation_kwargs, # type: ignore\n )\n generated_prompts = flatten_responses(\n [response[\"generations\"] for response in responses]\n )\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n evolved_instructions = []\n for generated_prompt in generated_prompts:\n generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n evolved_instructions.append(generated_prompt)\n\n if self.store_evolutions:\n instructions = [\n instruction + [evolved_instruction]\n for instruction, evolved_instruction in zip(\n instructions, evolved_instructions\n )\n ]\n else:\n instructions = [\n [evolved_instruction]\n for evolved_instruction in evolved_instructions\n ]\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(instructions)} instructions!\"\n )\n return instructions, dict(statistics)\n\n def _generate_answers(\n self, evolved_instructions: List[List[str]]\n ) -> Tuple[List[List[str]], \"LLMStatistics\"]:\n \"\"\"Generates the answer for the instructions in `instructions`.\n\n Args:\n evolved_instructions: A list of lists where each item is a list with either the last\n evolved instruction if `store_evolutions=False` or all the evolved instructions\n if `store_evolutions=True`.\n\n Returns:\n A list of answers for each instruction.\n \"\"\"\n formatted_instructions = [\n self.format_input(instruction)\n for instructions in evolved_instructions\n for instruction in instructions\n ]\n\n responses = self.llm.generate(\n formatted_instructions,\n num_generations=1,\n **self.llm.generation_kwargs, # type: ignore\n )\n generations = [response[\"generations\"] for response in responses]\n\n statistics: Dict[str, Any] = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n step = (\n self.num_evolutions\n if not self.include_original_instruction\n else self.num_evolutions + 1\n )\n\n return [\n flatten_responses(generations[i : i + step])\n 
for i in range(0, len(responses), step)\n ], dict(statistics)\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n evolved_instructions, statistics = self._evolve_instructions(inputs)\n\n if self.store_evolutions:\n # Remove the input instruction from the `evolved_instructions` list\n from_ = 1 if not self.include_original_instruction else 0\n evolved_instructions = [\n instruction[from_:] for instruction in evolved_instructions\n ]\n\n if not self.generate_answers:\n for input, instruction in zip(inputs, evolved_instructions):\n input.update(self.format_output(instruction))\n input.update(\n {\n \"distilabel_metadata\": {\n f\"statistics_instruction_{self.name}\": statistics\n }\n }\n )\n yield inputs\n\n self._logger.info(\n f\"\ud83c\udf89 Finished evolving {len(evolved_instructions)} instructions!\"\n )\n\n if self.generate_answers:\n self._logger.info(\n f\"\ud83e\udde0 Generating answers for the {len(evolved_instructions)} evolved instructions!\"\n )\n\n answers, statistics = self._generate_answers(evolved_instructions)\n\n self._logger.info(\n f\"\ud83c\udf89 Finished generating answers for the {len(evolved_instructions)} evolved\"\n \" instructions!\"\n )\n\n for idx, (input, instruction) in enumerate(\n zip(inputs, evolved_instructions)\n ):\n input.update(self.format_output(instruction, answers[idx]))\n input.update(\n {\n \"distilabel_metadata\": {\n f\"statistics_answer_{self.name}\": statistics\n }\n }\n )\n yield inputs\n\n @override\n def _sample_input(self) -> ChatType:\n return self.format_input(\n self._apply_random_mutation(\"<PLACEHOLDER_INSTRUCTION>\")\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.inputs","title":"inputs: List[str] property ","text":"The input for the task is the instruction . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.outputs","title":"outputs: List[str] property ","text":"The output for the task are the evolved_instruction/s , the answer if generate_answers=True and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.mutation_templates_names","title":"mutation_templates_names: List[str] property ","text":"Returns the names i.e. keys of the provided mutation_templates . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. And the system_prompt is added as the first message if it exists. Source code in src/distilabel/steps/tasks/evol_instruct/base.py def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. 
And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [{\"role\": \"user\", \"content\": input}]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.format_output","title":"format_output(instructions, answers=None) ","text":"The output for the task is a dict with: evolved_instruction or evolved_instructions , depending whether the value is either False or True for store_evolutions , respectively; answer if generate_answers=True ; and, finally, the model_name . Parameters: Name Type Description Default instructions Union[str, List[str]] The instructions to be included within the output. required answers Optional[List[str]] The answers to be included within the output if generate_answers=True . None Returns: Type Description Dict[str, Any] If store_evolutions=False and generate_answers=True return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...}; Dict[str, Any] if store_evolutions=True and generate_answers=True return {\"evolved_instructions\": ..., \"model_name\": ..., \"answer\": ...}; Dict[str, Any] if store_evolutions=False and generate_answers=False return {\"evolved_instruction\": ..., \"model_name\": ...}; Dict[str, Any] if store_evolutions=True and generate_answers=False return {\"evolved_instructions\": ..., \"model_name\": ...}. Source code in src/distilabel/steps/tasks/evol_instruct/base.py @override\ndef format_output( # type: ignore\n self, instructions: Union[str, List[str]], answers: Optional[List[str]] = None\n) -> Dict[str, Any]: # type: ignore\n \"\"\"The output for the task is a dict with: `evolved_instruction` or `evolved_instructions`,\n depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n `answer` if `generate_answers=True`; and, finally, the `model_name`.\n\n Args:\n instructions: The instructions to be included within the output.\n answers: The answers to be included within the output if `generate_answers=True`.\n\n Returns:\n If `store_evolutions=False` and `generate_answers=True` return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...};\n if `store_evolutions=True` and `generate_answers=True` return {\"evolved_instructions\": ..., \"model_name\": ..., \"answer\": ...};\n if `store_evolutions=False` and `generate_answers=False` return {\"evolved_instruction\": ..., \"model_name\": ...};\n if `store_evolutions=True` and `generate_answers=False` return {\"evolved_instructions\": ..., \"model_name\": ...}.\n \"\"\"\n _output = {}\n if not self.store_evolutions:\n _output[\"evolved_instruction\"] = instructions[-1]\n else:\n _output[\"evolved_instructions\"] = instructions\n\n if self.generate_answers and answers:\n if not self.store_evolutions:\n _output[\"answer\"] = answers[-1]\n else:\n _output[\"answers\"] = answers\n\n _output[\"model_name\"] = self.llm.model_name\n return _output\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._apply_random_mutation","title":"_apply_random_mutation(instruction) ","text":"Applies a random mutation from the ones provided as part of the mutation_templates enum, and returns the provided instruction within the mutation prompt. Parameters: Name Type Description Default instruction str The instruction to be included within the mutation prompt. required Returns: Type Description str A random mutation prompt with the provided instruction. 
Source code in src/distilabel/steps/tasks/evol_instruct/base.py def _apply_random_mutation(self, instruction: str) -> str:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n instruction: The instruction to be included within the mutation prompt.\n\n Returns:\n A random mutation prompt with the provided instruction.\n \"\"\"\n mutation = np.random.choice(self.mutation_templates_names)\n return self.mutation_templates[mutation].replace(\"<PROMPT>\", instruction) # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._evolve_instructions","title":"_evolve_instructions(inputs) ","text":"Evolves the instructions provided as part of the inputs of the task. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Returns: Type Description List[List[str]] A list where each item is a list with either the last evolved instruction if List[List[str]] store_evolutions=False or all the evolved instructions if store_evolutions=True . Source code in src/distilabel/steps/tasks/evol_instruct/base.py def _evolve_instructions(self, inputs: \"StepInput\") -> List[List[str]]:\n \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list where each item is a list with either the last evolved instruction if\n `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n \"\"\"\n\n instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n statistics: \"LLMStatistics\" = defaultdict(list)\n\n for iter_no in range(self.num_evolutions):\n formatted_prompts = []\n for instruction in instructions:\n formatted_prompts.append(self._apply_random_mutation(instruction[-1]))\n\n formatted_prompts = [\n self.format_input(prompt) for prompt in formatted_prompts\n ]\n responses = self.llm.generate(\n formatted_prompts,\n **self.llm.generation_kwargs, # type: ignore\n )\n generated_prompts = flatten_responses(\n [response[\"generations\"] for response in responses]\n )\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n evolved_instructions = []\n for generated_prompt in generated_prompts:\n generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n evolved_instructions.append(generated_prompt)\n\n if self.store_evolutions:\n instructions = [\n instruction + [evolved_instruction]\n for instruction, evolved_instruction in zip(\n instructions, evolved_instructions\n )\n ]\n else:\n instructions = [\n [evolved_instruction]\n for evolved_instruction in evolved_instructions\n ]\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(instructions)} instructions!\"\n )\n return instructions, dict(statistics)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._generate_answers","title":"_generate_answers(evolved_instructions) ","text":"Generates the answer for the instructions in instructions . Parameters: Name Type Description Default evolved_instructions List[List[str]] A list of lists where each item is a list with either the last evolved instruction if store_evolutions=False or all the evolved instructions if store_evolutions=True . 
required Returns: Type Description Tuple[List[List[str]], LLMStatistics] A list of answers for each instruction. Source code in src/distilabel/steps/tasks/evol_instruct/base.py def _generate_answers(\n self, evolved_instructions: List[List[str]]\n) -> Tuple[List[List[str]], \"LLMStatistics\"]:\n \"\"\"Generates the answer for the instructions in `instructions`.\n\n Args:\n evolved_instructions: A list of lists where each item is a list with either the last\n evolved instruction if `store_evolutions=False` or all the evolved instructions\n if `store_evolutions=True`.\n\n Returns:\n A list of answers for each instruction.\n \"\"\"\n formatted_instructions = [\n self.format_input(instruction)\n for instructions in evolved_instructions\n for instruction in instructions\n ]\n\n responses = self.llm.generate(\n formatted_instructions,\n num_generations=1,\n **self.llm.generation_kwargs, # type: ignore\n )\n generations = [response[\"generations\"] for response in responses]\n\n statistics: Dict[str, Any] = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n step = (\n self.num_evolutions\n if not self.include_original_instruction\n else self.num_evolutions + 1\n )\n\n return [\n flatten_responses(generations[i : i + step])\n for i in range(0, len(responses), step)\n ], dict(statistics)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.process","title":"process(inputs) ","text":"Processes the inputs of the task and generates the outputs using the LLM. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Yields: Type Description StepOutput A list of Python dictionaries with the outputs of the task. 
Source code in src/distilabel/steps/tasks/evol_instruct/base.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n evolved_instructions, statistics = self._evolve_instructions(inputs)\n\n if self.store_evolutions:\n # Remove the input instruction from the `evolved_instructions` list\n from_ = 1 if not self.include_original_instruction else 0\n evolved_instructions = [\n instruction[from_:] for instruction in evolved_instructions\n ]\n\n if not self.generate_answers:\n for input, instruction in zip(inputs, evolved_instructions):\n input.update(self.format_output(instruction))\n input.update(\n {\n \"distilabel_metadata\": {\n f\"statistics_instruction_{self.name}\": statistics\n }\n }\n )\n yield inputs\n\n self._logger.info(\n f\"\ud83c\udf89 Finished evolving {len(evolved_instructions)} instructions!\"\n )\n\n if self.generate_answers:\n self._logger.info(\n f\"\ud83e\udde0 Generating answers for the {len(evolved_instructions)} evolved instructions!\"\n )\n\n answers, statistics = self._generate_answers(evolved_instructions)\n\n self._logger.info(\n f\"\ud83c\udf89 Finished generating answers for the {len(evolved_instructions)} evolved\"\n \" instructions!\"\n )\n\n for idx, (input, instruction) in enumerate(\n zip(inputs, evolved_instructions)\n ):\n input.update(self.format_output(instruction, answers[idx]))\n input.update(\n {\n \"distilabel_metadata\": {\n f\"statistics_answer_{self.name}\": statistics\n }\n }\n )\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolComplexity","title":"EvolComplexity ","text":" Bases: EvolInstruct Evolve instructions to make them more complex using an LLM . EvolComplexity is a task that evolves instructions to make them more complex, and it is based in the EvolInstruct task, using slight different prompts, but the exact same evolutionary approach. Attributes: Name Type Description num_instructions The number of instructions to be generated. generate_answers Whether to generate answers for the instructions or not. Defaults to False . mutation_templates Dict[str, str] The mutation templates to be used for the generation of the instructions. min_length Dict[str, str] Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . max_length Dict[str, str] Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . seed Dict[str, str] The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . Runtime parameters min_length : Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. max_length : Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. seed : The number of evolutions to be run. Input columns - instruction (
str ): The instruction to evolve. Output columns - evolved_instruction (
str ): The evolved instruction. - answer (
str , optional): The answer to the instruction if generate_answers=True . - model_name (
str ): The name of the LLM used to evolve the instructions. Categories - evol
- instruction
- deita
References - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
- WizardLM: Empowering Large Language Models to Follow Complex Instructions
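In addition to the basic usage shown under Examples below, `EvolComplexity` inherits the `generate_answers` flag from `EvolInstruct`, so the evolved instruction can be answered in the same step; a minimal sketch follows, where the model id and the commented output values are illustrative placeholders.

```python
from distilabel.models import InferenceEndpointsLLM
from distilabel.steps.tasks import EvolComplexity

# Placeholder model id: use whichever LLM you actually have access to.
evol_complexity = EvolComplexity(
    llm=InferenceEndpointsLLM(
        model_id="mistralai/Mistral-7B-Instruct-v0.2",
    ),
    num_evolutions=2,
    generate_answers=True,  # inherited from EvolInstruct: also answer the evolved instruction
)

evol_complexity.load()

result = next(evol_complexity.process([{"instruction": "common instruction"}]))
# Illustrative output shape:
# [
#     {
#         'instruction': 'common instruction',
#         'evolved_instruction': 'evolved instruction',
#         'answer': 'answer to the instruction',
#         'model_name': 'model_name',
#     }
# ]
```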
Examples: Evolve an instruction using an LLM: from distilabel.steps.tasks import EvolComplexity\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity = EvolComplexity(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_complexity.load()\n\nresult = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n}\n Source code in src/distilabel/steps/tasks/evol_instruct/evol_complexity/base.py class EvolComplexity(EvolInstruct):\n \"\"\"Evolve instructions to make them more complex using an `LLM`.\n\n `EvolComplexity` is a task that evolves instructions to make them more complex,\n and it is based in the EvolInstruct task, using slight different prompts, but the\n exact same evolutionary approach.\n\n Attributes:\n num_instructions: The number of instructions to be generated.\n generate_answers: Whether to generate answers for the instructions or not. Defaults\n to `False`.\n mutation_templates: The mutation templates to be used for the generation of the\n instructions.\n min_length: Defines the length (in bytes) that the generated instruction needs to\n be higher than, to be considered valid. Defaults to `512`.\n max_length: Defines the length (in bytes) that the generated instruction needs to\n be lower than, to be considered valid. Defaults to `1024`.\n seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n Defaults to `42`.\n\n Runtime parameters:\n - `min_length`: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\n - `max_length`: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\n - `seed`: The number of evolutions to be run.\n\n Input columns:\n - instruction (`str`): The instruction to evolve.\n\n Output columns:\n - evolved_instruction (`str`): The evolved instruction.\n - answer (`str`, optional): The answer to the instruction if `generate_answers=True`.\n - model_name (`str`): The name of the LLM used to evolve the instructions.\n\n Categories:\n - evol\n - instruction\n - deita\n\n References:\n - [What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n\n Examples:\n Evolve an instruction using an LLM:\n\n ```python\n from distilabel.steps.tasks import EvolComplexity\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_complexity = EvolComplexity(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n )\n\n evol_complexity.load()\n\n result = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n # result\n # [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n\n ```\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n }\n ```\n \"\"\"\n\n mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolComplexityGenerator","title":"EvolComplexityGenerator ","text":" Bases: EvolInstructGenerator Generate evolved instructions with increased complexity using an LLM . EvolComplexityGenerator is a generation task that evolves instructions to make them more complex, and it is based in the EvolInstruct task, but using slight different prompts, but the exact same evolutionary approach. Attributes: Name Type Description num_instructions The number of instructions to be generated. generate_answers Whether to generate answers for the instructions or not. Defaults to False . mutation_templates Dict[str, str] The mutation templates to be used for the generation of the instructions. min_length Dict[str, str] Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . max_length Dict[str, str] Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . seed Dict[str, str] The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . Runtime parameters min_length : Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. max_length : Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. seed : The number of evolutions to be run. Output columns - instruction (
str ): The evolved instruction. - answer (
str , optional): The answer to the instruction if generate_answers=True . - model_name (
str ): The name of the LLM used to evolve the instructions. Categories - evol
- instruction
- generation
- deita
References - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
- WizardLM: Empowering Large Language Models to Follow Complex Instructions
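Being a generator task, `EvolComplexityGenerator` produces data by itself: `process()` takes no input batch (only an optional `offset`) and yields `(batch, last_batch)` tuples. The minimal usage sketch below complements the Examples that follow; the model id and output values are illustrative placeholders.

```python
from distilabel.models import InferenceEndpointsLLM
from distilabel.steps.tasks import EvolComplexityGenerator

# Placeholder model id: use whichever LLM you actually have access to.
evol_complexity_generator = EvolComplexityGenerator(
    llm=InferenceEndpointsLLM(
        model_id="mistralai/Mistral-7B-Instruct-v0.2",
    ),
    num_instructions=2,
)

evol_complexity_generator.load()

# The generator yields (batch, last_batch) tuples instead of taking input rows.
batch, is_last_batch = next(evol_complexity_generator.process())
# Illustrative output shape:
# [{'instruction': 'generated instruction', 'model_name': 'model_name'}, ...]
```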
Examples: Generate evolved instructions without initial instructions: from distilabel.steps.tasks import EvolComplexityGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity_generator = EvolComplexityGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=2,\n)\n\nevol_complexity_generator.load()\n\nresult = next(scorer.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n}\n Source code in src/distilabel/steps/tasks/evol_instruct/evol_complexity/generator.py class EvolComplexityGenerator(EvolInstructGenerator):\n \"\"\"Generate evolved instructions with increased complexity using an `LLM`.\n\n `EvolComplexityGenerator` is a generation task that evolves instructions to make\n them more complex, and it is based in the EvolInstruct task, but using slight different\n prompts, but the exact same evolutionary approach.\n\n Attributes:\n num_instructions: The number of instructions to be generated.\n generate_answers: Whether to generate answers for the instructions or not. Defaults\n to `False`.\n mutation_templates: The mutation templates to be used for the generation of the\n instructions.\n min_length: Defines the length (in bytes) that the generated instruction needs to\n be higher than, to be considered valid. Defaults to `512`.\n max_length: Defines the length (in bytes) that the generated instruction needs to\n be lower than, to be considered valid. Defaults to `1024`.\n seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n Defaults to `42`.\n\n Runtime parameters:\n - `min_length`: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\n - `max_length`: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\n - `seed`: The number of evolutions to be run.\n\n Output columns:\n - instruction (`str`): The evolved instruction.\n - answer (`str`, optional): The answer to the instruction if `generate_answers=True`.\n - model_name (`str`): The name of the LLM used to evolve the instructions.\n\n Categories:\n - evol\n - instruction\n - generation\n - deita\n\n References:\n - [What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n\n Examples:\n Generate evolved instructions without initial instructions:\n\n ```python\n from distilabel.steps.tasks import EvolComplexityGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_complexity_generator = EvolComplexityGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=2,\n )\n\n evol_complexity_generator.load()\n\n result = next(scorer.process())\n # result\n # [{'instruction': 'generated instruction', 'model_name': 'test'}]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n\n ```\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n }\n ```\n \"\"\"\n\n mutation_templates: Dict[str, str] = GENERATION_MUTATION_TEMPLATES\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator","title":"EvolInstructGenerator ","text":" Bases: GeneratorTask Generate evolved instructions using an LLM . WizardLM: Empowering Large Language Models to Follow Complex Instructions Attributes: Name Type Description num_instructions int The number of instructions to be generated. generate_answers bool Whether to generate answers for the instructions or not. Defaults to False . mutation_templates Dict[str, str] The mutation templates to be used for the generation of the instructions. min_length RuntimeParameter[int] Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . max_length RuntimeParameter[int] Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . seed RuntimeParameter[int] The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . Runtime parameters min_length : Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. max_length : Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. seed : The seed to be set for numpy in order to randomly pick a mutation method. Output columns - instruction (
str ): The generated instruction if generate_answers=False . - answer (
str ): The generated answer if generate_answers=True . - instructions (
List[str] ): The generated instructions if generate_answers=True . - model_name (
str ): The name of the LLM used to generate and evolve the instructions. Categories - evol
- instruction
- generation
References - WizardLM: Empowering Large Language Models to Follow Complex Instructions
- GitHub: h2oai/h2o-wizardlm
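As with the complexity variant, generation is driven by calling `process()` on the instantiated step itself, which yields `(batch, last_batch)` tuples; the sketch below additionally enables `generate_answers`, in which case a single final batch containing the answers is yielded. Model id and output values are illustrative placeholders.

```python
from distilabel.models import InferenceEndpointsLLM
from distilabel.steps.tasks import EvolInstructGenerator

# Placeholder model id: use whichever LLM you actually have access to.
evol_instruct_generator = EvolInstructGenerator(
    llm=InferenceEndpointsLLM(
        model_id="mistralai/Mistral-7B-Instruct-v0.2",
    ),
    num_instructions=2,
    generate_answers=True,  # also answer each generated instruction
)

evol_instruct_generator.load()

# With generate_answers=True the intermediate batches are skipped and the
# final (batch, last_batch=True) tuple carries instructions plus answers.
batch, is_last_batch = next(evol_instruct_generator.process())
# Illustrative output shape:
# [{'instruction': 'generated instruction', 'answer': 'generated answer', 'model_name': 'model_name'}, ...]
```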
Examples: Generate evolved instructions without initial instructions: from distilabel.steps.tasks import EvolInstructGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct_generator = EvolInstructGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=2,\n)\n\nevol_instruct_generator.load()\n\nresult = next(scorer.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n Citations @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n}\n Source code in src/distilabel/steps/tasks/evol_instruct/generator.py class EvolInstructGenerator(GeneratorTask):\n \"\"\"Generate evolved instructions using an `LLM`.\n\n WizardLM: Empowering Large Language Models to Follow Complex Instructions\n\n Attributes:\n num_instructions: The number of instructions to be generated.\n generate_answers: Whether to generate answers for the instructions or not. Defaults\n to `False`.\n mutation_templates: The mutation templates to be used for the generation of the\n instructions.\n min_length: Defines the length (in bytes) that the generated instruction needs to\n be higher than, to be considered valid. Defaults to `512`.\n max_length: Defines the length (in bytes) that the generated instruction needs to\n be lower than, to be considered valid. Defaults to `1024`.\n seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n Defaults to `42`.\n\n Runtime parameters:\n - `min_length`: Defines the length (in bytes) that the generated instruction needs\n to be higher than, to be considered valid.\n - `max_length`: Defines the length (in bytes) that the generated instruction needs\n to be lower than, to be considered valid.\n - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n Output columns:\n - instruction (`str`): The generated instruction if `generate_answers=False`.\n - answer (`str`): The generated answer if `generate_answers=True`.\n - instructions (`List[str]`): The generated instructions if `generate_answers=True`.\n - model_name (`str`): The name of the LLM used to generate and evolve the instructions.\n\n Categories:\n - evol\n - instruction\n - generation\n\n References:\n - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n - [GitHub: h2oai/h2o-wizardlm](https://github.com/h2oai/h2o-wizardlm)\n\n Examples:\n Generate evolved instructions without initial instructions:\n\n ```python\n from distilabel.steps.tasks import EvolInstructGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_instruct_generator = EvolInstructGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=2,\n )\n\n evol_instruct_generator.load()\n\n result = next(scorer.process())\n # result\n # [{'instruction': 'generated instruction', 'model_name': 'test'}]\n ```\n\n Citations:\n ```\n @misc{xu2023wizardlmempoweringlargelanguage,\n title={WizardLM: Empowering Large Language Models to Follow 
Complex Instructions},\n author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n year={2023},\n eprint={2304.12244},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2304.12244},\n }\n ```\n \"\"\"\n\n num_instructions: int\n generate_answers: bool = False\n mutation_templates: Dict[str, str] = GENERATION_MUTATION_TEMPLATES\n\n min_length: RuntimeParameter[int] = Field(\n default=512,\n description=\"Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\",\n )\n max_length: RuntimeParameter[int] = Field(\n default=1024,\n description=\"Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\",\n )\n\n seed: RuntimeParameter[int] = Field(\n default=42,\n description=\"As `numpy` is being used in order to randomly pick a mutation method, then is nice to seed a random seed.\",\n )\n _seed_texts: Optional[List[str]] = PrivateAttr(default_factory=list)\n _prompts: Optional[List[str]] = PrivateAttr(default_factory=list)\n\n def _generate_seed_texts(self) -> List[str]:\n \"\"\"Generates a list of seed texts to be used as part of the starting prompts for the task.\n\n It will use the `FRESH_START` mutation template, as it needs to generate text from scratch; and\n a list of English words will be used to generate the seed texts that will be provided to the\n mutation method and included within the prompt.\n\n Returns:\n A list of seed texts to be used as part of the starting prompts for the task.\n \"\"\"\n seed_texts = []\n for _ in range(self.num_instructions * 10):\n num_words = np.random.choice([1, 2, 3, 4])\n seed_texts.append(\n self.mutation_templates[\"FRESH_START\"].replace( # type: ignore\n \"<PROMPT>\",\n \", \".join(\n [\n np.random.choice(self._english_nouns).strip()\n for _ in range(num_words)\n ]\n ),\n )\n )\n return seed_texts\n\n @override\n def model_post_init(self, __context: Any) -> None:\n \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n This is useful if you want to do some validation that requires the entire model to be initialized.\n \"\"\"\n super().model_post_init(__context)\n\n np.random.seed(self.seed)\n\n self._seed_texts = self._generate_seed_texts()\n self._prompts = [\n np.random.choice(self._seed_texts) for _ in range(self.num_instructions)\n ]\n\n @cached_property\n def _english_nouns(self) -> List[str]:\n \"\"\"A list of English nouns to be used as part of the starting prompts for the task.\n\n References:\n - https://github.com/h2oai/h2o-wizardlm\n \"\"\"\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps/tasks/evol_instruct/english_nouns.txt\"\n )\n with open(_path, mode=\"r\") as f:\n return [line.strip() for line in f.readlines()]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are the `instruction`, the `answer` if `generate_answers=True`\n and the `model_name`.\"\"\"\n _outputs = [\"instruction\", \"model_name\"]\n if self.generate_answers:\n _outputs.append(\"answer\")\n return _outputs\n\n def format_output( # type: ignore\n self, instruction: str, answer: Optional[str] = None\n ) -> Dict[str, Any]:\n \"\"\"The output for the task is a dict with: `instruction`; `answer` if `generate_answers=True`;\n and, finally, the `model_name`.\n\n Args:\n instruction: The instruction to be included within the output.\n answer: The answer to be included 
within the output if `generate_answers=True`.\n\n Returns:\n If `generate_answers=True` return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...};\n if `generate_answers=False` return {\"instruction\": ..., \"model_name\": ...};\n \"\"\"\n _output = {\n \"instruction\": instruction,\n \"model_name\": self.llm.model_name,\n }\n if self.generate_answers and answer is not None:\n _output[\"answer\"] = answer\n return _output\n\n @property\n def mutation_templates_names(self) -> List[str]:\n \"\"\"Returns the names i.e. keys of the provided `mutation_templates`.\"\"\"\n return list(self.mutation_templates.keys())\n\n def _apply_random_mutation(self, iter_no: int) -> List[\"ChatType\"]:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n iter_no: The iteration number to be used to check whether the iteration is the\n first one i.e. FRESH_START, or not.\n\n Returns:\n A random mutation prompt with the provided instruction formatted as an OpenAI conversation.\n \"\"\"\n prompts = []\n for idx in range(self.num_instructions):\n if (\n iter_no == 0\n or \"Write one question or request containing\" in self._prompts[idx] # type: ignore\n ):\n mutation = \"FRESH_START\"\n else:\n mutation = np.random.choice(self.mutation_templates_names)\n if mutation == \"FRESH_START\":\n self._prompts[idx] = np.random.choice(self._seed_texts) # type: ignore\n\n prompt_with_template = (\n self.mutation_templates[mutation].replace( # type: ignore\n \"<PROMPT>\",\n self._prompts[idx], # type: ignore\n ) # type: ignore\n if iter_no != 0\n else self._prompts[idx] # type: ignore\n )\n prompts.append([{\"role\": \"user\", \"content\": prompt_with_template}])\n return prompts\n\n def _generate_answers(\n self, instructions: List[List[str]]\n ) -> Tuple[List[str], \"LLMStatistics\"]:\n \"\"\"Generates the answer for the last instruction in `instructions`.\n\n Args:\n instructions: A list of lists where each item is a list with either the last\n evolved instruction if `store_evolutions=False` or all the evolved instructions\n if `store_evolutions=True`.\n\n Returns:\n A list of answers for the last instruction in `instructions`.\n \"\"\"\n # TODO: update to generate answers for all the instructions\n _formatted_instructions = [\n [{\"role\": \"user\", \"content\": instruction[-1]}]\n for instruction in instructions\n ]\n responses = self.llm.generate(\n _formatted_instructions,\n **self.llm.generation_kwargs, # type: ignore\n )\n statistics: Dict[str, Any] = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n return flatten_responses(\n [response[\"generations\"] for response in responses]\n ), dict(statistics)\n\n @override\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\": # NOQA: C901, type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n offset: The offset to start the generation from. Defaults to 0.\n\n Yields:\n A list of Python dictionaries with the outputs of the task, and a boolean\n flag indicating whether the task has finished or not i.e. 
is the last batch.\n \"\"\"\n instructions = []\n mutation_no = 0\n\n # TODO: update to take into account `offset`\n iter_no = 0\n while len(instructions) < self.num_instructions:\n prompts = self._apply_random_mutation(iter_no=iter_no)\n\n # TODO: Update the function to extract from the dict\n responses = self.llm.generate(prompts, **self.llm.generation_kwargs) # type: ignore\n\n generated_prompts = flatten_responses(\n [response[\"generations\"] for response in responses]\n )\n statistics: \"LLMStatistics\" = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n for idx, generated_prompt in enumerate(generated_prompts):\n generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n if self.max_length >= len(generated_prompt) >= self.min_length: # type: ignore\n instructions.append(generated_prompt)\n self._prompts[idx] = np.random.choice(self._seed_texts) # type: ignore\n else:\n self._prompts[idx] = generated_prompt # type: ignore\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} with {len(instructions)} instructions already evolved!\"\n )\n iter_no += 1\n\n if len(instructions) > self.num_instructions:\n instructions = instructions[: self.num_instructions]\n if len(instructions) > mutation_no:\n mutation_no = len(instructions) - mutation_no\n\n if not self.generate_answers and len(instructions[-mutation_no:]) > 0:\n formatted_generations = []\n for mutated_instruction in instructions[-mutation_no:]:\n mutated_instruction = self.format_output(mutated_instruction)\n mutated_instruction[\"distilabel_metadata\"] = {\n f\"statistics_instruction_{self.name}\": dict(statistics)\n }\n formatted_generations.append(mutated_instruction)\n yield (\n formatted_generations,\n len(instructions) >= self.num_instructions,\n )\n\n self._logger.info(f\"\ud83c\udf89 Finished evolving {len(instructions)} instructions!\")\n\n if self.generate_answers:\n self._logger.info(\n f\"\ud83e\udde0 Generating answers for the {len(instructions)} evolved instructions!\"\n )\n\n answers, statistics = self._generate_answers(instructions)\n\n self._logger.info(\n f\"\ud83c\udf89 Finished generating answers for the {len(instructions)} evolved instructions!\"\n )\n\n formatted_outputs = []\n for instruction, answer in zip(instructions, answers):\n formatted_output = self.format_output(instruction, answer)\n formatted_output[\"distilabel_metadata\"] = {\n f\"statistics_answer_{self.name}\": dict(statistics)\n }\n formatted_outputs.append(formatted_output)\n\n yield (\n formatted_outputs,\n True,\n )\n\n @override\n def _sample_input(self) -> \"ChatType\":\n return self._apply_random_mutation(iter_no=0)[0]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._english_nouns","title":"_english_nouns: List[str] cached property ","text":"A list of English nouns to be used as part of the starting prompts for the task. References - https://github.com/h2oai/h2o-wizardlm
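To make the seeding mechanism concrete: each starting prompt is a `FRESH_START` template whose `<PROMPT>` placeholder is filled with one to four randomly sampled English nouns. The standalone sketch below imitates that step; the template text and noun list are simplified placeholders, not the library's actual assets.

```python
import numpy as np

# Simplified placeholders standing in for the FRESH_START mutation template
# and the bundled english_nouns.txt word list.
FRESH_START_TEMPLATE = (
    "Write one question or request containing one or more of the following words: <PROMPT>"
)
ENGLISH_NOUNS = ["market", "river", "telescope", "recipe", "garden"]

np.random.seed(42)

# Sample between one and four nouns, as `_generate_seed_texts` does.
num_words = np.random.choice([1, 2, 3, 4])
seed_words = ", ".join(
    np.random.choice(ENGLISH_NOUNS).strip() for _ in range(num_words)
)

# Insert the sampled nouns into the template to obtain one seed prompt.
seed_text = FRESH_START_TEMPLATE.replace("<PROMPT>", seed_words)
print(seed_text)
```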
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.outputs","title":"outputs: List[str] property ","text":"The output for the task are the instruction , the answer if generate_answers=True and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.mutation_templates_names","title":"mutation_templates_names: List[str] property ","text":"Returns the names i.e. keys of the provided mutation_templates . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._generate_seed_texts","title":"_generate_seed_texts() ","text":"Generates a list of seed texts to be used as part of the starting prompts for the task. It will use the FRESH_START mutation template, as it needs to generate text from scratch; and a list of English words will be used to generate the seed texts that will be provided to the mutation method and included within the prompt. Returns: Type Description List[str] A list of seed texts to be used as part of the starting prompts for the task. Source code in src/distilabel/steps/tasks/evol_instruct/generator.py def _generate_seed_texts(self) -> List[str]:\n \"\"\"Generates a list of seed texts to be used as part of the starting prompts for the task.\n\n It will use the `FRESH_START` mutation template, as it needs to generate text from scratch; and\n a list of English words will be used to generate the seed texts that will be provided to the\n mutation method and included within the prompt.\n\n Returns:\n A list of seed texts to be used as part of the starting prompts for the task.\n \"\"\"\n seed_texts = []\n for _ in range(self.num_instructions * 10):\n num_words = np.random.choice([1, 2, 3, 4])\n seed_texts.append(\n self.mutation_templates[\"FRESH_START\"].replace( # type: ignore\n \"<PROMPT>\",\n \", \".join(\n [\n np.random.choice(self._english_nouns).strip()\n for _ in range(num_words)\n ]\n ),\n )\n )\n return seed_texts\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.model_post_init","title":"model_post_init(__context) ","text":"Override this method to perform additional initialization after __init__ and model_construct . This is useful if you want to do some validation that requires the entire model to be initialized. Source code in src/distilabel/steps/tasks/evol_instruct/generator.py @override\ndef model_post_init(self, __context: Any) -> None:\n \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n This is useful if you want to do some validation that requires the entire model to be initialized.\n \"\"\"\n super().model_post_init(__context)\n\n np.random.seed(self.seed)\n\n self._seed_texts = self._generate_seed_texts()\n self._prompts = [\n np.random.choice(self._seed_texts) for _ in range(self.num_instructions)\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.format_output","title":"format_output(instruction, answer=None) ","text":"The output for the task is a dict with: instruction ; answer if generate_answers=True ; and, finally, the model_name . Parameters: Name Type Description Default instruction str The instruction to be included within the output. required answer Optional[str] The answer to be included within the output if generate_answers=True . 
None Returns: Type Description Dict[str, Any] If generate_answers=True return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...}; Dict[str, Any] if generate_answers=False return {\"instruction\": ..., \"model_name\": ...}; Source code in src/distilabel/steps/tasks/evol_instruct/generator.py def format_output( # type: ignore\n self, instruction: str, answer: Optional[str] = None\n) -> Dict[str, Any]:\n \"\"\"The output for the task is a dict with: `instruction`; `answer` if `generate_answers=True`;\n and, finally, the `model_name`.\n\n Args:\n instruction: The instruction to be included within the output.\n answer: The answer to be included within the output if `generate_answers=True`.\n\n Returns:\n If `generate_answers=True` return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...};\n if `generate_answers=False` return {\"instruction\": ..., \"model_name\": ...};\n \"\"\"\n _output = {\n \"instruction\": instruction,\n \"model_name\": self.llm.model_name,\n }\n if self.generate_answers and answer is not None:\n _output[\"answer\"] = answer\n return _output\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._apply_random_mutation","title":"_apply_random_mutation(iter_no) ","text":"Applies a random mutation from the ones provided as part of the mutation_templates enum, and returns the provided instruction within the mutation prompt. Parameters: Name Type Description Default iter_no int The iteration number to be used to check whether the iteration is the first one i.e. FRESH_START, or not. required Returns: Type Description List[ChatType] A random mutation prompt with the provided instruction formatted as an OpenAI conversation. Source code in src/distilabel/steps/tasks/evol_instruct/generator.py def _apply_random_mutation(self, iter_no: int) -> List[\"ChatType\"]:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n iter_no: The iteration number to be used to check whether the iteration is the\n first one i.e. FRESH_START, or not.\n\n Returns:\n A random mutation prompt with the provided instruction formatted as an OpenAI conversation.\n \"\"\"\n prompts = []\n for idx in range(self.num_instructions):\n if (\n iter_no == 0\n or \"Write one question or request containing\" in self._prompts[idx] # type: ignore\n ):\n mutation = \"FRESH_START\"\n else:\n mutation = np.random.choice(self.mutation_templates_names)\n if mutation == \"FRESH_START\":\n self._prompts[idx] = np.random.choice(self._seed_texts) # type: ignore\n\n prompt_with_template = (\n self.mutation_templates[mutation].replace( # type: ignore\n \"<PROMPT>\",\n self._prompts[idx], # type: ignore\n ) # type: ignore\n if iter_no != 0\n else self._prompts[idx] # type: ignore\n )\n prompts.append([{\"role\": \"user\", \"content\": prompt_with_template}])\n return prompts\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._generate_answers","title":"_generate_answers(instructions) ","text":"Generates the answer for the last instruction in instructions . Parameters: Name Type Description Default instructions List[List[str]] A list of lists where each item is a list with either the last evolved instruction if store_evolutions=False or all the evolved instructions if store_evolutions=True . required Returns: Type Description Tuple[List[str], LLMStatistics] A list of answers for the last instruction in instructions . 
Source code in src/distilabel/steps/tasks/evol_instruct/generator.py def _generate_answers(\n self, instructions: List[List[str]]\n) -> Tuple[List[str], \"LLMStatistics\"]:\n \"\"\"Generates the answer for the last instruction in `instructions`.\n\n Args:\n instructions: A list of lists where each item is a list with either the last\n evolved instruction if `store_evolutions=False` or all the evolved instructions\n if `store_evolutions=True`.\n\n Returns:\n A list of answers for the last instruction in `instructions`.\n \"\"\"\n # TODO: update to generate answers for all the instructions\n _formatted_instructions = [\n [{\"role\": \"user\", \"content\": instruction[-1]}]\n for instruction in instructions\n ]\n responses = self.llm.generate(\n _formatted_instructions,\n **self.llm.generation_kwargs, # type: ignore\n )\n statistics: Dict[str, Any] = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n return flatten_responses(\n [response[\"generations\"] for response in responses]\n ), dict(statistics)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.process","title":"process(offset=0) ","text":"Processes the inputs of the task and generates the outputs using the LLM. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0. 0 Yields: Type Description GeneratorStepOutput A list of Python dictionaries with the outputs of the task, and a boolean GeneratorStepOutput flag indicating whether the task has finished or not i.e. is the last batch. Source code in src/distilabel/steps/tasks/evol_instruct/generator.py @override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\": # NOQA: C901, type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n offset: The offset to start the generation from. Defaults to 0.\n\n Yields:\n A list of Python dictionaries with the outputs of the task, and a boolean\n flag indicating whether the task has finished or not i.e. 
is the last batch.\n \"\"\"\n instructions = []\n mutation_no = 0\n\n # TODO: update to take into account `offset`\n iter_no = 0\n while len(instructions) < self.num_instructions:\n prompts = self._apply_random_mutation(iter_no=iter_no)\n\n # TODO: Update the function to extract from the dict\n responses = self.llm.generate(prompts, **self.llm.generation_kwargs) # type: ignore\n\n generated_prompts = flatten_responses(\n [response[\"generations\"] for response in responses]\n )\n statistics: \"LLMStatistics\" = defaultdict(list)\n for response in responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n for idx, generated_prompt in enumerate(generated_prompts):\n generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n if self.max_length >= len(generated_prompt) >= self.min_length: # type: ignore\n instructions.append(generated_prompt)\n self._prompts[idx] = np.random.choice(self._seed_texts) # type: ignore\n else:\n self._prompts[idx] = generated_prompt # type: ignore\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} with {len(instructions)} instructions already evolved!\"\n )\n iter_no += 1\n\n if len(instructions) > self.num_instructions:\n instructions = instructions[: self.num_instructions]\n if len(instructions) > mutation_no:\n mutation_no = len(instructions) - mutation_no\n\n if not self.generate_answers and len(instructions[-mutation_no:]) > 0:\n formatted_generations = []\n for mutated_instruction in instructions[-mutation_no:]:\n mutated_instruction = self.format_output(mutated_instruction)\n mutated_instruction[\"distilabel_metadata\"] = {\n f\"statistics_instruction_{self.name}\": dict(statistics)\n }\n formatted_generations.append(mutated_instruction)\n yield (\n formatted_generations,\n len(instructions) >= self.num_instructions,\n )\n\n self._logger.info(f\"\ud83c\udf89 Finished evolving {len(instructions)} instructions!\")\n\n if self.generate_answers:\n self._logger.info(\n f\"\ud83e\udde0 Generating answers for the {len(instructions)} evolved instructions!\"\n )\n\n answers, statistics = self._generate_answers(instructions)\n\n self._logger.info(\n f\"\ud83c\udf89 Finished generating answers for the {len(instructions)} evolved instructions!\"\n )\n\n formatted_outputs = []\n for instruction, answer in zip(instructions, answers):\n formatted_output = self.format_output(instruction, answer)\n formatted_output[\"distilabel_metadata\"] = {\n f\"statistics_answer_{self.name}\": dict(statistics)\n }\n formatted_outputs.append(formatted_output)\n\n yield (\n formatted_outputs,\n True,\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality","title":"EvolQuality ","text":" Bases: Task Evolve the quality of the responses using an LLM . EvolQuality task is used to evolve the quality of the responses given a prompt, by generating a new response with a language model. This step implements the evolution quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. Attributes: Name Type Description num_evolutions int The number of evolutions to be performed on the responses. store_evolutions bool Whether to store all the evolved responses or just the last one. Defaults to False . include_original_response bool Whether to include the original response within the evolved responses. Defaults to False . mutation_templates Dict[str, str] The mutation templates to be used to evolve the responses. 
seed RuntimeParameter[int] The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . Runtime parameters seed : The seed to be set for numpy in order to randomly pick a mutation method. Input columns - instruction (
str ): The instruction that was used to generate the responses . - response (
str ): The response to be rewritten. Output columns - evolved_response ( 
str ): The evolved response if store_evolutions=False . - evolved_responses (
List[str] ): The evolved responses if store_evolutions=True . - model_name (
str ): The name of the LLM used to evolve the responses. Categories References What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning Examples: Evolve the quality of the responses given a prompt: from distilabel.steps.tasks import EvolQuality\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_quality = EvolQuality(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_quality.load()\n\nresult = next(\n evol_quality.process(\n [\n {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'response': 'a response',\n# 'evolved_response': 'evolved response',\n# 'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n# }\n# ]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n Source code in src/distilabel/steps/tasks/evol_quality/base.py class EvolQuality(Task):\n \"\"\"Evolve the quality of the responses using an `LLM`.\n\n `EvolQuality` task is used to evolve the quality of the responses given a prompt,\n by generating a new response with a language model. This step implements the evolution\n quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of\n Automatic Data Selection in Instruction Tuning'.\n\n Attributes:\n num_evolutions: The number of evolutions to be performed on the responses.\n store_evolutions: Whether to store all the evolved responses or just the last one.\n Defaults to `False`.\n include_original_response: Whether to include the original response within the evolved\n responses. Defaults to `False`.\n mutation_templates: The mutation templates to be used to evolve the responses.\n seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n Defaults to `42`.\n\n Runtime parameters:\n - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n Input columns:\n - instruction (`str`): The instruction that was used to generate the `responses`.\n - response (`str`): The responses to be rewritten.\n\n Output columns:\n - evolved_response (`str`): The evolved response if `store_evolutions=False`.\n - evolved_responses (`List[str]`): The evolved responses if `store_evolutions=True`.\n - model_name (`str`): The name of the LLM used to evolve the responses.\n\n Categories:\n - evol\n - response\n - deita\n\n References:\n - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n Examples:\n Evolve the quality of the responses given a prompt:\n\n ```python\n from distilabel.steps.tasks import EvolQuality\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n evol_quality = EvolQuality(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n )\n\n evol_quality.load()\n\n result = next(\n evol_quality.process(\n [\n {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'common instruction',\n # 'response': 'a response',\n # 'evolved_response': 'evolved response',\n # 'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n \"\"\"\n\n num_evolutions: int\n store_evolutions: bool = False\n include_original_response: bool = False\n mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n\n seed: RuntimeParameter[int] = Field(\n default=42,\n description=\"As `numpy` is being used in order to randomly pick a mutation method, then is nice to set a random seed.\",\n )\n\n @override\n def model_post_init(self, __context: Any) -> None:\n \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n This is useful if you want to do some validation that requires the entire model to be initialized.\n \"\"\"\n super().model_post_init(__context)\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task are the `instruction` and `response`.\"\"\"\n return [\"instruction\", \"response\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. 
And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [{\"role\": \"user\", \"content\": input}]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are the `evolved_response/s` and the `model_name`.\"\"\"\n # TODO: having to define a `model_name` column every time as the `Task.outputs` is not ideal,\n # this could be handled always and the value could be included within the DAG validation when\n # a `Task` is used, since all the `Task` subclasses will have an `llm` with a `model_name` attr.\n _outputs = [\n (\"evolved_response\" if not self.store_evolutions else \"evolved_responses\"),\n \"model_name\",\n ]\n\n return _outputs\n\n def format_output(self, responses: Union[str, List[str]]) -> Dict[str, Any]: # type: ignore\n \"\"\"The output for the task is a dict with: `evolved_response` or `evolved_responses`,\n depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n and, finally, the `model_name`.\n\n Args:\n responses: The responses to be included within the output.\n\n Returns:\n if `store_evolutions=False` return {\"evolved_response\": ..., \"model_name\": ...};\n if `store_evolutions=True` return {\"evolved_responses\": ..., \"model_name\": ...}.\n \"\"\"\n _output = {}\n\n if not self.store_evolutions:\n _output[\"evolved_response\"] = responses[-1]\n else:\n _output[\"evolved_responses\"] = responses\n\n _output[\"model_name\"] = self.llm.model_name\n return _output\n\n @property\n def mutation_templates_names(self) -> List[str]:\n \"\"\"Returns the names i.e. keys of the provided `mutation_templates` enum.\"\"\"\n return list(self.mutation_templates.keys())\n\n def _apply_random_mutation(self, instruction: str, response: str) -> str:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n instruction: The instruction to be included within the mutation prompt.\n\n Returns:\n A random mutation prompt with the provided instruction.\n \"\"\"\n mutation = np.random.choice(self.mutation_templates_names)\n return (\n self.mutation_templates[mutation]\n .replace(\"<PROMPT>\", instruction)\n .replace(\"<RESPONSE>\", response)\n )\n\n def _evolve_reponses(\n self, inputs: \"StepInput\"\n ) -> Tuple[List[List[str]], Dict[str, Any]]:\n \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list where each item is a list with either the last evolved instruction if\n `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n \"\"\"\n np.random.seed(self.seed)\n instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n responses: List[List[str]] = [[input[\"response\"]] for input in inputs]\n statistics: Dict[str, Any] = defaultdict(list)\n\n for iter_no in range(self.num_evolutions):\n formatted_prompts = []\n for instruction, response in zip(instructions, responses):\n formatted_prompts.append(\n self._apply_random_mutation(instruction[-1], response[-1])\n )\n\n formatted_prompts = [\n self.format_input(prompt) for prompt in formatted_prompts\n ]\n\n generated_responses = self.llm.generate(\n formatted_prompts,\n **self.llm.generation_kwargs, # type: ignore\n )\n for response in generated_responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n if self.store_evolutions:\n 
responses = [\n response + [evolved_response[\"generations\"][0]]\n for response, evolved_response in zip(\n responses, generated_responses\n )\n ]\n else:\n responses = [\n [evolved_response[\"generations\"][0]]\n for evolved_response in generated_responses\n ]\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(responses)} responses!\"\n )\n\n return responses, dict(statistics)\n\n @override\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n responses, statistics = self._evolve_reponses(inputs)\n\n if self.store_evolutions:\n # Remove the input instruction from the `evolved_responses` list\n from_ = 1 if not self.include_original_response else 0\n responses = [response[from_:] for response in responses]\n\n for input, response in zip(inputs, responses):\n input.update(self.format_output(response))\n input.update(\n {\"distilabel_metadata\": {f\"statistics_{self.name}\": statistics}}\n )\n yield inputs\n\n self._logger.info(f\"\ud83c\udf89 Finished evolving {len(responses)} instructions!\")\n\n @override\n def _sample_input(self) -> ChatType:\n return self.format_input(\"<PLACEHOLDER_INSTRUCTION>\")\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.inputs","title":"inputs: List[str] property ","text":"The input for the task are the instruction and response . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.outputs","title":"outputs: List[str] property ","text":"The output for the task are the evolved_response/s and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.mutation_templates_names","title":"mutation_templates_names: List[str] property ","text":"Returns the names i.e. keys of the provided mutation_templates enum. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.model_post_init","title":"model_post_init(__context) ","text":"Override this method to perform additional initialization after __init__ and model_construct . This is useful if you want to do some validation that requires the entire model to be initialized. Source code in src/distilabel/steps/tasks/evol_quality/base.py @override\ndef model_post_init(self, __context: Any) -> None:\n \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n This is useful if you want to do some validation that requires the entire model to be initialized.\n \"\"\"\n super().model_post_init(__context)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. And the system_prompt is added as the first message if it exists. Source code in src/distilabel/steps/tasks/evol_quality/base.py def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. 
And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [{\"role\": \"user\", \"content\": input}]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.format_output","title":"format_output(responses) ","text":"The output for the task is a dict with: evolved_response or evolved_responses , depending whether the value is either False or True for store_evolutions , respectively; and, finally, the model_name . Parameters: Name Type Description Default responses Union[str, List[str]] The responses to be included within the output. required Returns: Type Description Dict[str, Any] if store_evolutions=False return {\"evolved_response\": ..., \"model_name\": ...}; Dict[str, Any] if store_evolutions=True return {\"evolved_responses\": ..., \"model_name\": ...}. Source code in src/distilabel/steps/tasks/evol_quality/base.py def format_output(self, responses: Union[str, List[str]]) -> Dict[str, Any]: # type: ignore\n \"\"\"The output for the task is a dict with: `evolved_response` or `evolved_responses`,\n depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n and, finally, the `model_name`.\n\n Args:\n responses: The responses to be included within the output.\n\n Returns:\n if `store_evolutions=False` return {\"evolved_response\": ..., \"model_name\": ...};\n if `store_evolutions=True` return {\"evolved_responses\": ..., \"model_name\": ...}.\n \"\"\"\n _output = {}\n\n if not self.store_evolutions:\n _output[\"evolved_response\"] = responses[-1]\n else:\n _output[\"evolved_responses\"] = responses\n\n _output[\"model_name\"] = self.llm.model_name\n return _output\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality._apply_random_mutation","title":"_apply_random_mutation(instruction, response) ","text":"Applies a random mutation from the ones provided as part of the mutation_templates enum, and returns the provided instruction within the mutation prompt. Parameters: Name Type Description Default instruction str The instruction to be included within the mutation prompt. required Returns: Type Description str A random mutation prompt with the provided instruction. Source code in src/distilabel/steps/tasks/evol_quality/base.py def _apply_random_mutation(self, instruction: str, response: str) -> str:\n \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n enum, and returns the provided instruction within the mutation prompt.\n\n Args:\n instruction: The instruction to be included within the mutation prompt.\n\n Returns:\n A random mutation prompt with the provided instruction.\n \"\"\"\n mutation = np.random.choice(self.mutation_templates_names)\n return (\n self.mutation_templates[mutation]\n .replace(\"<PROMPT>\", instruction)\n .replace(\"<RESPONSE>\", response)\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality._evolve_reponses","title":"_evolve_reponses(inputs) ","text":"Evolves the instructions provided as part of the inputs of the task. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Returns: Type Description List[List[str]] A list where each item is a list with either the last evolved instruction if Dict[str, Any] store_evolutions=False or all the evolved instructions if store_evolutions=True . 
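For orientation (a hedged sketch, not taken from the upstream docstring), process consumes this method as: responses, statistics = self._evolve_reponses(inputs)\n# store_evolutions=False -> responses like [[\"latest evolution\"], ...] (one inner item per row)\n# store_evolutions=True  -> responses like [[\"original response\", \"evolution 1\", ...], ...]\n# statistics: each key maps to one value per generated response\n 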
Source code in src/distilabel/steps/tasks/evol_quality/base.py def _evolve_reponses(\n self, inputs: \"StepInput\"\n) -> Tuple[List[List[str]], Dict[str, Any]]:\n \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list where each item is a list with either the last evolved instruction if\n `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n \"\"\"\n np.random.seed(self.seed)\n instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n responses: List[List[str]] = [[input[\"response\"]] for input in inputs]\n statistics: Dict[str, Any] = defaultdict(list)\n\n for iter_no in range(self.num_evolutions):\n formatted_prompts = []\n for instruction, response in zip(instructions, responses):\n formatted_prompts.append(\n self._apply_random_mutation(instruction[-1], response[-1])\n )\n\n formatted_prompts = [\n self.format_input(prompt) for prompt in formatted_prompts\n ]\n\n generated_responses = self.llm.generate(\n formatted_prompts,\n **self.llm.generation_kwargs, # type: ignore\n )\n for response in generated_responses:\n for k, v in response[\"statistics\"].items():\n statistics[k].append(v[0])\n\n if self.store_evolutions:\n responses = [\n response + [evolved_response[\"generations\"][0]]\n for response, evolved_response in zip(\n responses, generated_responses\n )\n ]\n else:\n responses = [\n [evolved_response[\"generations\"][0]]\n for evolved_response in generated_responses\n ]\n\n self._logger.info(\n f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(responses)} responses!\"\n )\n\n return responses, dict(statistics)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.process","title":"process(inputs) ","text":"Processes the inputs of the task and generates the outputs using the LLM. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Returns: Type Description StepOutput A list of Python dictionaries with the outputs of the task. Source code in src/distilabel/steps/tasks/evol_quality/base.py @override\ndef process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Returns:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n\n responses, statistics = self._evolve_reponses(inputs)\n\n if self.store_evolutions:\n # Remove the input instruction from the `evolved_responses` list\n from_ = 1 if not self.include_original_response else 0\n responses = [response[from_:] for response in responses]\n\n for input, response in zip(inputs, responses):\n input.update(self.format_output(response))\n input.update(\n {\"distilabel_metadata\": {f\"statistics_{self.name}\": statistics}}\n )\n yield inputs\n\n self._logger.info(f\"\ud83c\udf89 Finished evolving {len(responses)} instructions!\")\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings","title":"GenerateEmbeddings ","text":" Bases: Step Generate embeddings using the last hidden state of an LLM . Generate embeddings for a text input using the last hidden state of an LLM , as described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. 
Attributes: Name Type Description llm LLM The LLM to use to generate the embeddings. Input columns - text (
str , List[Dict[str, str]] ): The input text or conversation to generate embeddings for. Output columns - embedding (
List[float] ): The embedding of the input text or conversation. - model_name (
str ): The model name used to generate the embeddings. Categories References - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
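 A hedged sketch of reading the output, assuming the embedder step constructed in the example that follows: row = next(embedder.process([{\"text\": \"Hello, how are you?\"}]))[0]\nvector = row[\"embedding\"]  # List[float] taken from the last hidden state\nprint(row[\"model_name\"], len(vector))\n 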
Examples: Rank LLM candidates: from distilabel.steps.tasks import GenerateEmbeddings\nfrom distilabel.models.llms.huggingface import TransformersLLM\n\n# Consider this as a placeholder for your actual LLM.\nembedder = GenerateEmbeddings(\n llm=TransformersLLM(\n model=\"TaylorAI/bge-micro-v2\",\n model_kwargs={\"is_decoder\": True},\n cuda_devices=[],\n )\n)\nembedder.load()\n\nresult = next(\n embedder.process(\n [\n {\"text\": \"Hello, how are you?\"},\n ]\n )\n)\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n Source code in src/distilabel/steps/tasks/generate_embeddings.py class GenerateEmbeddings(Step):\n \"\"\"Generate embeddings using the last hidden state of an `LLM`.\n\n Generate embeddings for a text input using the last hidden state of an `LLM`, as\n described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of\n Automatic Data Selection in Instruction Tuning'.\n\n Attributes:\n llm: The `LLM` to use to generate the embeddings.\n\n Input columns:\n - text (`str`, `List[Dict[str, str]]`): The input text or conversation to generate\n embeddings for.\n\n Output columns:\n - embedding (`List[float]`): The embedding of the input text or conversation.\n - model_name (`str`): The model name used to generate the embeddings.\n\n Categories:\n - embedding\n - llm\n\n References:\n - [What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n\n Examples:\n Rank LLM candidates:\n\n ```python\n from distilabel.steps.tasks import GenerateEmbeddings\n from distilabel.models.llms.huggingface import TransformersLLM\n\n # Consider this as a placeholder for your actual LLM.\n embedder = GenerateEmbeddings(\n llm=TransformersLLM(\n model=\"TaylorAI/bge-micro-v2\",\n model_kwargs={\"is_decoder\": True},\n cuda_devices=[],\n )\n )\n embedder.load()\n\n result = next(\n embedder.process(\n [\n {\"text\": \"Hello, how are you?\"},\n ]\n )\n )\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n \"\"\"\n\n llm: LLM\n\n def load(self) -> None:\n \"\"\"Loads the `LLM` used to generate the embeddings.\"\"\"\n super().load()\n\n self.llm.load()\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The inputs for the task is a `text` column containing either a string or a\n list of dictionaries in OpenAI chat-like format.\"\"\"\n return [\"text\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs for the task is an `embedding` column containing the embedding of\n the `text` input.\"\"\"\n return [\"embedding\", \"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Formats the input to be used by the LLM to generate the embeddings. The input\n can be in `ChatType` format or a string. 
If a string, it will be converted to a\n list of dictionaries in OpenAI chat-like format.\n\n Args:\n input: The input to format.\n\n Returns:\n The OpenAI chat-like format of the input.\n \"\"\"\n text = input[\"text\"] = input[\"text\"]\n\n # input is in `ChatType` format\n if isinstance(text, str):\n return [{\"role\": \"user\", \"content\": text}]\n\n if is_openai_format(text):\n return text\n\n raise DistilabelUserError(\n f\"Couldn't format input for step {self.name}. The `text` input column has to\"\n \" be a string or a list of dictionaries in OpenAI chat-like format.\",\n page=\"components-gallery/tasks/generateembeddings/\",\n )\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generates an embedding for each input using the last hidden state of the `LLM`.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n formatted_inputs = [self.format_input(input) for input in inputs]\n last_hidden_states = self.llm.get_last_hidden_states(formatted_inputs)\n for input, hidden_state in zip(inputs, last_hidden_states):\n input[\"embedding\"] = hidden_state[-1].tolist()\n input[\"model_name\"] = self.llm.model_name\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.inputs","title":"inputs: StepColumns property ","text":"The inputs for the task is a text column containing either a string or a list of dictionaries in OpenAI chat-like format. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.outputs","title":"outputs: StepColumns property ","text":"The outputs for the task is an embedding column containing the embedding of the text input. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.load","title":"load() ","text":"Loads the LLM used to generate the embeddings. Source code in src/distilabel/steps/tasks/generate_embeddings.py def load(self) -> None:\n \"\"\"Loads the `LLM` used to generate the embeddings.\"\"\"\n super().load()\n\n self.llm.load()\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.format_input","title":"format_input(input) ","text":"Formats the input to be used by the LLM to generate the embeddings. The input can be in ChatType format or a string. If a string, it will be converted to a list of dictionaries in OpenAI chat-like format. Parameters: Name Type Description Default input Dict[str, Any] The input to format. required Returns: Type Description ChatType The OpenAI chat-like format of the input. Source code in src/distilabel/steps/tasks/generate_embeddings.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Formats the input to be used by the LLM to generate the embeddings. The input\n can be in `ChatType` format or a string. If a string, it will be converted to a\n list of dictionaries in OpenAI chat-like format.\n\n Args:\n input: The input to format.\n\n Returns:\n The OpenAI chat-like format of the input.\n \"\"\"\n text = input[\"text\"] = input[\"text\"]\n\n # input is in `ChatType` format\n if isinstance(text, str):\n return [{\"role\": \"user\", \"content\": text}]\n\n if is_openai_format(text):\n return text\n\n raise DistilabelUserError(\n f\"Couldn't format input for step {self.name}. 
The `text` input column has to\"\n \" be a string or a list of dictionaries in OpenAI chat-like format.\",\n page=\"components-gallery/tasks/generateembeddings/\",\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.process","title":"process(inputs) ","text":"Generates an embedding for each input using the last hidden state of the LLM . Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Yields: Type Description StepOutput A list of Python dictionaries with the outputs of the task. Source code in src/distilabel/steps/tasks/generate_embeddings.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generates an embedding for each input using the last hidden state of the `LLM`.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n \"\"\"\n formatted_inputs = [self.format_input(input) for input in inputs]\n last_hidden_states = self.llm.get_last_hidden_states(formatted_inputs)\n for input, hidden_state in zip(inputs, last_hidden_states):\n input[\"embedding\"] = hidden_state[-1].tolist()\n input[\"model_name\"] = self.llm.model_name\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct","title":"Genstruct ","text":" Bases: Task Generate a pair of instruction-response from a document using an LLM . Genstruct is a pre-defined task designed to generate valid instructions from a given raw document, with the title and the content, enabling the creation of new, partially synthetic instruction finetuning datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is inspired in the Ada-Instruct paper. Note The Genstruct prompt i.e. the task, can be used with any model really, but the safest / recommended option is to use NousResearch/Genstruct-7B as the LLM provided to the task, since it was trained for this specific task. Attributes: Name Type Description _template Union[Template, None] a Jinja2 template used to format the input for the LLM. Input columns - title (
str ): The title of the document. - content (
str ): The content of the document. Output columns - user (
str ): The user's instruction based on the document. - assistant (
str ): The assistant's response based on the user's instruction. - model_name (
str ): The model name used to generate the user and assistant outputs. Categories - text-generation 
- instruction
- response
References - Genstruct 7B by Nous Research
- Ada-Instruct: Adapting Instruction Generators for Complex Reasoning
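 Note that, as the format_output source further down shows, generations that cannot be parsed come back as {\"user\": None, \"assistant\": None} , so downstream steps may want to drop such rows; a hedged sketch where rows stands in for the step's output batch: pairs = [row for row in rows if row[\"user\"] is not None and row[\"assistant\"] is not None]\n 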
Examples: Generate instructions from raw documents using the title and content: from distilabel.steps.tasks import Genstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ngenstruct = Genstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"NousResearch/Genstruct-7B\",\n ),\n)\n\ngenstruct.load()\n\nresult = next(\n genstruct.process(\n [\n {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'title': 'An instruction',\n# 'content': 'content of the document',\n# 'model_name': 'test',\n# 'user': 'An instruction',\n# 'assistant': 'content of the document',\n# }\n# ]\n Citations @misc{cui2023adainstructadaptinginstructiongenerators,\n title={Ada-Instruct: Adapting Instruction Generators for Complex Reasoning},\n author={Wanyun Cui and Qianle Wang},\n year={2023},\n eprint={2310.04484},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2310.04484},\n}\n Source code in src/distilabel/steps/tasks/genstruct.py class Genstruct(Task):\n \"\"\"Generate a pair of instruction-response from a document using an `LLM`.\n\n `Genstruct` is a pre-defined task designed to generate valid instructions from a given raw document,\n with the title and the content, enabling the creation of new, partially synthetic instruction finetuning\n datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is\n inspired in the Ada-Instruct paper.\n\n Note:\n The Genstruct prompt i.e. the task, can be used with any model really, but the safest / recommended\n option is to use `NousResearch/Genstruct-7B` as the LLM provided to the task, since it was trained\n for this specific task.\n\n Attributes:\n _template: a Jinja2 template used to format the input for the LLM.\n\n Input columns:\n - title (`str`): The title of the document.\n - content (`str`): The content of the document.\n\n Output columns:\n - user (`str`): The user's instruction based on the document.\n - assistant (`str`): The assistant's response based on the user's instruction.\n - model_name (`str`): The model name used to generate the `feedback` and `result`.\n\n Categories:\n - text-generation\n - instruction\n - response\n\n References:\n - [Genstruct 7B by Nous Research](https://huggingface.co/NousResearch/Genstruct-7B)\n - [Ada-Instruct: Adapting Instruction Generators for Complex Reasoning](https://arxiv.org/abs/2310.04484)\n\n Examples:\n Generate instructions from raw documents using the title and content:\n\n ```python\n from distilabel.steps.tasks import Genstruct\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n genstruct = Genstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"NousResearch/Genstruct-7B\",\n ),\n )\n\n genstruct.load()\n\n result = next(\n genstruct.process(\n [\n {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'title': 'An instruction',\n # 'content': 'content of the document',\n # 'model_name': 'test',\n # 'user': 'An instruction',\n # 'assistant': 'content of the document',\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{cui2023adainstructadaptinginstructiongenerators,\n title={Ada-Instruct: Adapting Instruction Generators for Complex Reasoning},\n author={Wanyun Cui and Qianle Wang},\n year={2023},\n eprint={2310.04484},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2310.04484},\n 
}\n ```\n \"\"\"\n\n _template: Union[Template, None] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"genstruct.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the task are the `title` and the `content`.\"\"\"\n return [\"title\", \"content\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n title=input[\"title\"], content=input[\"content\"]\n ),\n }\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are the `user` instruction based on the provided document\n and the `assistant` response based on the user's instruction.\"\"\"\n return [\"user\", \"assistant\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted so that both the user and the assistant messages are\n captured.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the keys `user` and `assistant` containing the content for each role.\n \"\"\"\n if output is None:\n return {\"user\": None, \"assistant\": None}\n\n matches = re.search(_PARSE_GENSTRUCT_OUTPUT_REGEX, output, re.DOTALL)\n if not matches:\n return {\"user\": None, \"assistant\": None}\n\n return {\n \"user\": matches.group(1).strip(),\n \"assistant\": matches.group(2).strip(),\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.inputs","title":"inputs: List[str] property ","text":"The inputs for the task are the title and the content . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.outputs","title":"outputs: List[str] property ","text":"The output for the task are the user instruction based on the provided document and the assistant response based on the user's instruction. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/genstruct.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"genstruct.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. 
Source code in src/distilabel/steps/tasks/genstruct.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n title=input[\"title\"], content=input[\"content\"]\n ),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.format_output","title":"format_output(output, input) ","text":"The output is formatted so that both the user and the assistant messages are captured. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the keys user and assistant containing the content for each role. Source code in src/distilabel/steps/tasks/genstruct.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted so that both the user and the assistant messages are\n captured.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the keys `user` and `assistant` containing the content for each role.\n \"\"\"\n if output is None:\n return {\"user\": None, \"assistant\": None}\n\n matches = re.search(_PARSE_GENSTRUCT_OUTPUT_REGEX, output, re.DOTALL)\n if not matches:\n return {\"user\": None, \"assistant\": None}\n\n return {\n \"user\": matches.group(1).strip(),\n \"assistant\": matches.group(2).strip(),\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator","title":"BitextRetrievalGenerator ","text":" Bases: _EmbeddingDataGenerator Generate bitext retrieval data with an LLM to later on train an embedding model. BitextRetrievalGenerator is a GeneratorTask that generates bitext retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Attributes: Name Type Description source_language str The source language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. target_language str The target language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. unit Optional[Literal['sentence', 'phrase', 'passage']] The unit of the data to be generated, which can be sentence , phrase , or passage . Defaults to None , meaning that it will be randomly sampled. difficulty Optional[Literal['elementary school', 'high school', 'college']] The difficulty of the query to be generated, which can be elementary school , high school , or college . Defaults to None , meaning that it will be randomly sampled. high_score Optional[Literal['4', '4.5', '5']] The high score of the query to be generated, which can be 4 , 4.5 , or 5 . Defaults to None , meaning that it will be randomly sampled. low_score Optional[Literal['2.5', '3', '3.5']] The low score of the query to be generated, which can be 2.5 , 3 , or 3.5 . 
Defaults to None , meaning that it will be randomly sampled. seed Optional[Literal['2.5', '3', '3.5']] The random seed to be set in case there's any sampling within the format_input method. Output columns - S1 (
str ): the first sentence generated by the LLM . - S2 (
str ): the second sentence generated by the LLM . - S3 (
str ): the third sentence generated by the LLM . - model_name (
str ): the name of the model used to generate the bitext retrieval data. Examples: Generate bitext retrieval data for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import BitextRetrievalGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = BitextRetrievalGenerator(\n source_language=\"English\",\n target_language=\"Spanish\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class BitextRetrievalGenerator(_EmbeddingDataGenerator):\n \"\"\"Generate bitext retrieval data with an `LLM` to later on train an embedding model.\n\n `BitextRetrievalGenerator` is a `GeneratorTask` that generates bitext retrieval data with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Attributes:\n source_language: The source language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n target_language: The target language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n unit: The unit of the data to be generated, which can be `sentence`, `phrase`, or `passage`.\n Defaults to `None`, meaning that it will be randomly sampled.\n difficulty: The difficulty of the query to be generated, which can be `elementary school`, `high school`, or `college`.\n Defaults to `None`, meaning that it will be randomly sampled.\n high_score: The high score of the query to be generated, which can be `4`, `4.5`, or `5`.\n Defaults to `None`, meaning that it will be randomly sampled.\n low_score: The low score of the query to be generated, which can be `2.5`, `3`, or `3.5`.\n Defaults to `None`, meaning that it will be randomly sampled.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n Output columns:\n - S1 (`str`): the first sentence generated by the `LLM`.\n - S2 (`str`): the second sentence generated by the `LLM`.\n - S3 (`str`): the third sentence generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the bitext retrieval\n data.\n\n Examples:\n Generate bitext retrieval data for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import BitextRetrievalGenerator\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = BitextRetrievalGenerator(\n source_language=\"English\",\n target_language=\"Spanish\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n ```\n \"\"\"\n\n source_language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n target_language: str = Field(\n default=...,\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n unit: Optional[Literal[\"sentence\", \"phrase\", \"passage\"]] = None\n difficulty: Optional[Literal[\"elementary 
school\", \"high school\", \"college\"]] = None\n high_score: Optional[Literal[\"4\", \"4.5\", \"5\"]] = None\n low_score: Optional[Literal[\"2.5\", \"3\", \"3.5\"]] = None\n\n _template_name: str = PrivateAttr(default=\"bitext-retrieval\")\n _can_be_used_with_offline_batch_generation = True\n\n @property\n def prompt(self) -> ChatType:\n \"\"\"Contains the `prompt` to be used in the `process` method, rendering the `_template`; and\n formatted as an OpenAI formatted chat i.e. a `ChatType`, assuming that there's only one turn,\n being from the user with the content being the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n source_language=self.source_language,\n target_language=self.target_language,\n unit=self.unit or random.choice([\"sentence\", \"phrase\", \"passage\"]),\n difficulty=self.difficulty\n or random.choice([\"elementary school\", \"high school\", \"college\"]),\n high_score=self.high_score or random.choice([\"4\", \"4.5\", \"5\"]),\n low_score=self.low_score or random.choice([\"2.5\", \"3\", \"3.5\"]),\n ).strip(),\n }\n ] # type: ignore\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\"S1\", \"S2\", \"S3\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator.prompt","title":"prompt: ChatType property ","text":"Contains the prompt to be used in the process method, rendering the _template ; and formatted as an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData","title":"GenerateLongTextMatchingData ","text":" Bases: _EmbeddingDataGeneration Generate long text matching data with an LLM to later on train an embedding model. GenerateLongTextMatchingData is a Task that generates long text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Note Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-long\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-long category. Attributes: Name Type Description language str The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. seed str The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params. Input columns - task (
str ): The task description to be used in the generation. Output columns - input (
str ): the input generated by the LLM . - positive_document (
str ): the positive document generated by the LLM . - model_name (
str ): the name of the model used to generate the long text matching data. References - Improving Text Embeddings with Large Language Models
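 Since keys is [\"input\", \"positive_document\"] , the raw LLM generation is parsed into a Python dict with those keys and the task also attaches model_name ; a hedged, purely illustrative row: {\"input\": \"a long query-like text ...\", \"positive_document\": \"a document that should match the input ...\", \"model_name\": \"...\"}\n 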
Examples: Generate synthetic long text matching data for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateLongTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-long\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateLongTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class GenerateLongTextMatchingData(_EmbeddingDataGeneration):\n \"\"\"Generate long text matching data with an `LLM` to later on train an embedding model.\n\n `GenerateLongTextMatchingData` is a `Task` that generates long text matching data with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Note:\n Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n with the `category=\"text-matching-long\"`; so that the `LLM` generates a list of tasks that\n are flattened so that each row contains a single task for the text-matching-long category.\n\n Attributes:\n language: The language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n Note that in this task the `seed` has no effect since there are no sampling params.\n\n Input columns:\n - task (`str`): The task description to be used in the generation.\n\n Output columns:\n - input (`str`): the input generated by the `LLM`.\n - positive_document (`str`): the positive document generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the long text matching\n data.\n\n References:\n - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n Examples:\n Generate synthetic long text matching data for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateLongTextMatchingData\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-long\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateLongTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n ```\n \"\"\"\n\n language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n _template_name: str = PrivateAttr(default=\"long-text-matching\")\n _can_be_used_with_offline_batch_generation = True\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n ).strip(),\n }\n ]\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\"input\", \"positive_document\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData.format_input","title":"format_input(input) ","text":"Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . Parameters: Name Type Description Default input Dict[str, Any] The input dictionary containing the task to be used in the _template . required Returns: Type Description ChatType A list with a single chat containing the user's message with the rendered _template . Source code in src/distilabel/steps/tasks/improving_text_embeddings.py def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n ).strip(),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData","title":"GenerateShortTextMatchingData ","text":" Bases: _EmbeddingDataGeneration Generate short text matching data with an LLM to later on train an embedding model. GenerateShortTextMatchingData is a Task that generates short text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Note Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-short\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-short category. 
Attributes: Name Type Description language str The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. seed str The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params. Input columns - task (
str ): The task description to be used in the generation. Output columns - input (
str ): the input generated by the LLM . - positive_document (
str ): the positive document generated by the LLM . - model_name (
str ): the name of the model used to generate the short text matching data. References - Improving Text Embeddings with Large Language Models
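A minimal standalone sketch, assuming llm is an already-configured LLM instance supported by distilabel and that the task text is an illustrative placeholder, showing how a single row with a task column maps to the documented output columns: from distilabel.steps.tasks import GenerateShortTextMatchingData\n\ngenerate = GenerateShortTextMatchingData(\n    language=\"English\",\n    llm=llm,  # assumed: any already-configured LLM instance\n)\ngenerate.load()\n\nresult = next(\n    generate.process(\n        [\n            {\"task\": \"Given a short forum post, retrieve semantically similar posts.\"}\n        ]\n    )\n)\n# each row in `result` contains `input`, `positive_document` and `model_name`\n 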
Examples: Generate synthetic short text matching data for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateShortTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-short\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateShortTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class GenerateShortTextMatchingData(_EmbeddingDataGeneration):\n \"\"\"Generate short text matching data with an `LLM` to later on train an embedding model.\n\n `GenerateShortTextMatchingData` is a `Task` that generates short text matching data with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Note:\n Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n with the `category=\"text-matching-short\"`; so that the `LLM` generates a list of tasks that\n are flattened so that each row contains a single task for the text-matching-short category.\n\n Attributes:\n language: The language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n Note that in this task the `seed` has no effect since there are no sampling params.\n\n Input columns:\n - task (`str`): The task description to be used in the generation.\n\n Output columns:\n - input (`str`): the input generated by the `LLM`.\n - positive_document (`str`): the positive document generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the short text matching\n data.\n\n References:\n - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n Examples:\n Generate synthetic short text matching data for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateShortTextMatchingData\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-short\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateShortTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n ```\n \"\"\"\n\n language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n _template_name: str = PrivateAttr(default=\"short-text-matching\")\n _can_be_used_with_offline_batch_generation = True\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n ).strip(),\n }\n ]\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\"input\", \"positive_document\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData.format_input","title":"format_input(input) ","text":"Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n ).strip(),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData","title":"GenerateTextClassificationData ","text":" Bases: _EmbeddingDataGeneration Generate text classification data with an LLM to later on train an embedding model. GenerateTextClassificationData is a Task that generates text classification data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Note Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-classification\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-classification category. Attributes: Name Type Description language str The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. 
difficulty Optional[Literal['high school', 'college', 'PhD']] The difficulty of the query to be generated, which can be high school , college , or PhD . Defaults to None , meaning that it will be randomly sampled. clarity Optional[Literal['clear', 'understandable with some effort', 'ambiguous']] The clarity of the query to be generated, which can be clear , understandable with some effort , or ambiguous . Defaults to None , meaning that it will be randomly sampled. seed Optional[Literal['clear', 'understandable with some effort', 'ambiguous']] The random seed to be set in case there's any sampling within the format_input method. Input columns - task (
str ): The task description to be used in the generation. Output columns - input_text (
str ): the input text generated by the LLM . - label (
str ): the label generated by the LLM . - misleading_label (
str ): the misleading label generated by the LLM . - model_name (
str ): the name of the model used to generate the text classification data. References - Improving Text Embeddings with Large Language Models
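A minimal standalone sketch, assuming llm is an already-configured LLM instance supported by distilabel and that the task text is an illustrative placeholder, fixing difficulty and clarity instead of letting them be randomly sampled: from distilabel.steps.tasks import GenerateTextClassificationData\n\ngenerate = GenerateTextClassificationData(\n    language=\"English\",\n    difficulty=\"college\",\n    clarity=\"clear\",\n    llm=llm,  # assumed: any already-configured LLM instance\n)\ngenerate.load()\n\nresult = next(\n    generate.process(\n        [\n            {\"task\": \"Classify customer reviews of electronics as positive or negative.\"}\n        ]\n    )\n)\n# each row in `result` contains `input_text`, `label`, `misleading_label` and `model_name`\n 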
Examples: Generate synthetic text classification data for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-classification\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextClassificationData(\n language=\"English\",\n difficulty=\"high school\",\n clarity=\"clear\",\n llm=..., # LLM instance\n )\n\n task >> generate\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class GenerateTextClassificationData(_EmbeddingDataGeneration):\n \"\"\"Generate text classification data with an `LLM` to later on train an embedding model.\n\n `GenerateTextClassificationData` is a `Task` that generates text classification data with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Note:\n Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n with the `category=\"text-classification\"`; so that the `LLM` generates a list of tasks that\n are flattened so that each row contains a single task for the text-classification category.\n\n Attributes:\n language: The language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n difficulty: The difficulty of the query to be generated, which can be `high school`, `college`, or `PhD`.\n Defaults to `None`, meaning that it will be randomly sampled.\n clarity: The clarity of the query to be generated, which can be `clear`, `understandable with some effort`,\n or `ambiguous`. 
Defaults to `None`, meaning that it will be randomly sampled.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n Input columns:\n - task (`str`): The task description to be used in the generation.\n\n Output columns:\n - input_text (`str`): the input text generated by the `LLM`.\n - label (`str`): the label generated by the `LLM`.\n - misleading_label (`str`): the misleading label generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the text classification\n data.\n\n References:\n - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n Examples:\n Generate synthetic text classification data for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-classification\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextClassificationData(\n language=\"English\",\n difficulty=\"high school\",\n clarity=\"clear\",\n llm=..., # LLM instance\n )\n\n task >> generate\n ```\n \"\"\"\n\n language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n difficulty: Optional[Literal[\"high school\", \"college\", \"PhD\"]] = None\n clarity: Optional[\n Literal[\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ] = None\n\n _template_name: str = PrivateAttr(default=\"text-classification\")\n _can_be_used_with_offline_batch_generation = True\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n difficulty=self.difficulty\n or random.choice([\"high school\", \"college\", \"PhD\"]),\n clarity=self.clarity\n or random.choice(\n [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ),\n ).strip(),\n }\n ]\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\"input_text\", \"label\", \"misleading_label\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData.format_input","title":"format_input(input) ","text":"Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. 
This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . Parameters: Name Type Description Default input Dict[str, Any] The input dictionary containing the task to be used in the _template . required Returns: Type Description ChatType A list with a single chat containing the user's message with the rendered _template . Source code in src/distilabel/steps/tasks/improving_text_embeddings.py def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n difficulty=self.difficulty\n or random.choice([\"high school\", \"college\", \"PhD\"]),\n clarity=self.clarity\n or random.choice(\n [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ),\n ).strip(),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData","title":"GenerateTextRetrievalData ","text":" Bases: _EmbeddingDataGeneration Generate text retrieval data with an LLM to later on train an embedding model. GenerateTextRetrievalData is a Task that generates text retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Note Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-retrieval\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-retrieval category. Attributes: Name Type Description language str The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. query_type Optional[Literal['extremely long-tail', 'long-tail', 'common']] The type of query to be generated, which can be extremely long-tail , long-tail , or common . Defaults to None , meaning that it will be randomly sampled. query_length Optional[Literal['less than 5 words', '5 to 15 words', 'at least 10 words']] The length of the query to be generated, which can be less than 5 words , 5 to 15 words , or at least 10 words . Defaults to None , meaning that it will be randomly sampled. difficulty Optional[Literal['high school', 'college', 'PhD']] The difficulty of the query to be generated, which can be high school , college , or PhD . Defaults to None , meaning that it will be randomly sampled. clarity Optional[Literal['clear', 'understandable with some effort', 'ambiguous']] The clarity of the query to be generated, which can be clear , understandable with some effort , or ambiguous . 
Defaults to None , meaning that it will be randomly sampled. num_words Optional[Literal[50, 100, 200, 300, 400, 500]] The number of words in the query to be generated, which can be 50 , 100 , 200 , 300 , 400 , or 500 . Defaults to None , meaning that it will be randomly sampled. seed Optional[Literal[50, 100, 200, 300, 400, 500]] The random seed to be set in case there's any sampling within the format_input method. Input columns - task (
str ): The task description to be used in the generation. Output columns - user_query (
str ): the user query generated by the LLM . - positive_document (
str ): the positive document generated by the LLM . - hard_negative_document (
str ): the hard negative document generated by the LLM . - model_name (
str ): the name of the model used to generate the text retrieval data. References - Improving Text Embeddings with Large Language Models
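A minimal standalone sketch, assuming llm is an already-configured LLM instance supported by distilabel and that the task text is an illustrative placeholder; the attributes left unset ( query_type and num_words ) are randomly sampled as described above: from distilabel.steps.tasks import GenerateTextRetrievalData\n\ngenerate = GenerateTextRetrievalData(\n    language=\"English\",\n    query_length=\"5 to 15 words\",\n    difficulty=\"high school\",\n    clarity=\"clear\",\n    llm=llm,  # assumed: any already-configured LLM instance\n)\ngenerate.load()\n\nresult = next(\n    generate.process(\n        [\n            {\"task\": \"Retrieve relevant scientific abstracts for a question about climate change.\"}\n        ]\n    )\n)\n# each row in `result` contains `user_query`, `positive_document`, `hard_negative_document` and `model_name`\n 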
Examples: Generate synthetic text retrieval data for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-retrieval\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextRetrievalData(\n language=\"English\",\n query_type=\"common\",\n query_length=\"5 to 15 words\",\n difficulty=\"high school\",\n clarity=\"clear\",\n num_words=100,\n llm=..., # LLM instance\n )\n\n task >> generate\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class GenerateTextRetrievalData(_EmbeddingDataGeneration):\n \"\"\"Generate text retrieval data with an `LLM` to later on train an embedding model.\n\n `GenerateTextRetrievalData` is a `Task` that generates text retrieval data with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Note:\n Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n with the `category=\"text-retrieval\"`; so that the `LLM` generates a list of tasks that\n are flattened so that each row contains a single task for the text-retrieval category.\n\n Attributes:\n language: The language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n query_type: The type of query to be generated, which can be `extremely long-tail`, `long-tail`,\n or `common`. Defaults to `None`, meaning that it will be randomly sampled.\n query_length: The length of the query to be generated, which can be `less than 5 words`, `5 to 15 words`,\n or `at least 10 words`. Defaults to `None`, meaning that it will be randomly sampled.\n difficulty: The difficulty of the query to be generated, which can be `high school`, `college`, or `PhD`.\n Defaults to `None`, meaning that it will be randomly sampled.\n clarity: The clarity of the query to be generated, which can be `clear`, `understandable with some effort`,\n or `ambiguous`. 
Defaults to `None`, meaning that it will be randomly sampled.\n num_words: The number of words in the query to be generated, which can be `50`, `100`, `200`, `300`, `400`, or `500`.\n Defaults to `None`, meaning that it will be randomly sampled.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n Input columns:\n - task (`str`): The task description to be used in the generation.\n\n Output columns:\n - user_query (`str`): the user query generated by the `LLM`.\n - positive_document (`str`): the positive document generated by the `LLM`.\n - hard_negative_document (`str`): the hard negative document generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the text retrieval data.\n\n References:\n - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n Examples:\n Generate synthetic text retrieval data for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-retrieval\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextRetrievalData(\n language=\"English\",\n query_type=\"common\",\n query_length=\"5 to 15 words\",\n difficulty=\"high school\",\n clarity=\"clear\",\n num_words=100,\n llm=..., # LLM instance\n )\n\n task >> generate\n ```\n \"\"\"\n\n language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n query_type: Optional[Literal[\"extremely long-tail\", \"long-tail\", \"common\"]] = None\n query_length: Optional[\n Literal[\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n ] = None\n difficulty: Optional[Literal[\"high school\", \"college\", \"PhD\"]] = None\n clarity: Optional[\n Literal[\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ] = None\n num_words: Optional[Literal[50, 100, 200, 300, 400, 500]] = None\n\n _template_name: str = PrivateAttr(default=\"text-retrieval\")\n _can_be_used_with_offline_batch_generation = True\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n query_type=self.query_type\n or random.choice([\"extremely long-tail\", \"long-tail\", \"common\"]),\n query_length=self.query_length\n or random.choice(\n [\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n ),\n difficulty=self.difficulty\n or random.choice([\"high school\", \"college\", \"PhD\"]),\n clarity=self.clarity\n or random.choice(\n [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ),\n num_words=self.num_words\n or random.choice([50, 100, 200, 300, 400, 500]),\n ).strip(),\n }\n ]\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\n \"user_query\",\n \"positive_document\",\n \"hard_negative_document\",\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData.format_input","title":"format_input(input) ","text":"Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . Parameters: Name Type Description Default input Dict[str, Any] The input dictionary containing the task to be used in the _template . required Returns: Type Description ChatType A list with a single chat containing the user's message with the rendered _template . Source code in src/distilabel/steps/tasks/improving_text_embeddings.py def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n randomly sampling those if not provided. This method will render the `_template` with\n the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n there's only one turn, being from the user with the content being the rendered `_template`.\n\n Args:\n input: The input dictionary containing the `task` to be used in the `_template`.\n\n Returns:\n A list with a single chat containing the user's message with the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n task=input[\"task\"],\n language=self.language,\n query_type=self.query_type\n or random.choice([\"extremely long-tail\", \"long-tail\", \"common\"]),\n query_length=self.query_length\n or random.choice(\n [\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n ),\n difficulty=self.difficulty\n or random.choice([\"high school\", \"college\", \"PhD\"]),\n clarity=self.clarity\n or random.choice(\n [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n ),\n num_words=self.num_words\n or random.choice([50, 100, 200, 300, 400, 500]),\n ).strip(),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator","title":"MonolingualTripletGenerator ","text":" Bases: _EmbeddingDataGenerator Generate monolingual triplets with an LLM to later on train an embedding model. MonolingualTripletGenerator is a GeneratorTask that generates monolingual triplets with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. Attributes: Name Type Description language str The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. unit Optional[Literal['sentence', 'phrase', 'passage']] The unit of the data to be generated, which can be sentence , phrase , or passage . Defaults to None , meaning that it will be randomly sampled. difficulty Optional[Literal['elementary school', 'high school', 'college']] The difficulty of the query to be generated, which can be elementary school , high school , or college . Defaults to None , meaning that it will be randomly sampled. high_score Optional[Literal['4', '4.5', '5']] The high score of the query to be generated, which can be 4 , 4.5 , or 5 . Defaults to None , meaning that it will be randomly sampled. low_score Optional[Literal['2.5', '3', '3.5']] The low score of the query to be generated, which can be 2.5 , 3 , or 3.5 . Defaults to None , meaning that it will be randomly sampled. seed Optional[Literal['2.5', '3', '3.5']] The random seed to be set in case there's any sampling within the format_input method. Output columns - S1 (
str ): the first sentence generated by the LLM . - S2 (
str ): the second sentence generated by the LLM . - S3 (
str ): the third sentence generated by the LLM . - model_name (
str ): the name of the model used to generate the monolingual triplets. Examples: Generate monolingual triplets for training embedding models: from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import MonolingualTripletGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = MonolingualTripletGenerator(\n language=\"English\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n Source code in src/distilabel/steps/tasks/improving_text_embeddings.py class MonolingualTripletGenerator(_EmbeddingDataGenerator):\n \"\"\"Generate monolingual triplets with an `LLM` to later on train an embedding model.\n\n `MonolingualTripletGenerator` is a `GeneratorTask` that generates monolingual triplets with an\n `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n Text Embeddings with Large Language Models\" and the data is generated based on the\n provided attributes, or randomly sampled if not provided.\n\n Attributes:\n language: The language of the data to be generated, which can be any of the languages\n retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n unit: The unit of the data to be generated, which can be `sentence`, `phrase`, or `passage`.\n Defaults to `None`, meaning that it will be randomly sampled.\n difficulty: The difficulty of the query to be generated, which can be `elementary school`, `high school`, or `college`.\n Defaults to `None`, meaning that it will be randomly sampled.\n high_score: The high score of the query to be generated, which can be `4`, `4.5`, or `5`.\n Defaults to `None`, meaning that it will be randomly sampled.\n low_score: The low score of the query to be generated, which can be `2.5`, `3`, or `3.5`.\n Defaults to `None`, meaning that it will be randomly sampled.\n seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n Output columns:\n - S1 (`str`): the first sentence generated by the `LLM`.\n - S2 (`str`): the second sentence generated by the `LLM`.\n - S3 (`str`): the third sentence generated by the `LLM`.\n - model_name (`str`): the name of the model used to generate the monolingual triplets.\n\n Examples:\n Generate monolingual triplets for training embedding models:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps.tasks import MonolingualTripletGenerator\n\n with Pipeline(\"my-pipeline\") as pipeline:\n task = MonolingualTripletGenerator(\n language=\"English\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n ```\n \"\"\"\n\n language: str = Field(\n default=\"English\",\n description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n )\n\n unit: Optional[Literal[\"sentence\", \"phrase\", \"passage\"]] = None\n difficulty: Optional[Literal[\"elementary school\", \"high school\", \"college\"]] = None\n high_score: Optional[Literal[\"4\", \"4.5\", \"5\"]] = None\n low_score: Optional[Literal[\"2.5\", \"3\", \"3.5\"]] = None\n\n _template_name: str = PrivateAttr(default=\"monolingual-triplet\")\n _can_be_used_with_offline_batch_generation = True\n\n @property\n def prompt(self) -> ChatType:\n \"\"\"Contains the `prompt` to be used in the `process` method, rendering the `_template`; and\n formatted as an OpenAI formatted chat i.e. 
a `ChatType`, assuming that there's only one turn,\n being from the user with the content being the rendered `_template`.\n \"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n language=self.language,\n unit=self.unit or random.choice([\"sentence\", \"phrase\", \"passage\"]),\n difficulty=self.difficulty\n or random.choice([\"elementary school\", \"high school\", \"college\"]),\n high_score=self.high_score or random.choice([\"4\", \"4.5\", \"5\"]),\n low_score=self.low_score or random.choice([\"2.5\", \"3\", \"3.5\"]),\n ).strip(),\n }\n ] # type: ignore\n\n @property\n def keys(self) -> List[str]:\n \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n return [\"S1\", \"S2\", \"S3\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator.prompt","title":"prompt: ChatType property ","text":"Contains the prompt to be used in the process method, rendering the _template ; and formatted as an OpenAI formatted chat i.e. a ChatType , assuming that there's only one turn, being from the user with the content being the rendered _template . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator.keys","title":"keys: List[str] property ","text":"Contains the keys that will be parsed from the LLM output into a Python dict. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation","title":"InstructionBacktranslation ","text":" Bases: Task Self-Alignment with Instruction Backtranslation. Attributes: Name Type Description _template Optional[Template] the Jinja2 template to use for the Instruction Backtranslation task. Input columns - instruction (
str ): The reference instruction to evaluate the text output. - generation (
str ): The text output to evaluate for the given instruction. Output columns - score (
str ): The score for the generation based on the given instruction. - reason (
str ): The reason for the provided score. - model_name (
str ): The model name used to score the generation. Categories References Self-Alignment with Instruction Backtranslation Examples: Generate a score and reason for a given instruction and generation: from distilabel.steps.tasks import InstructionBacktranslation\n\ninstruction_backtranslation = InstructionBacktranslation(\n name=\"instruction_backtranslation\",\n llm=llm,\n input_batch_size=10,\n output_mappings={\"model_name\": \"scoring_model\"},\n )\ninstruction_backtranslation.load()\n\nresult = next(\n instruction_backtranslation.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generation\": \"4\",\n }\n ]\n )\n)\n# result\n# [\n# {\n# \"instruction\": \"How much is 2+2?\",\n# \"generation\": \"4\",\n# \"score\": 3,\n# \"reason\": \"Reason for the generation.\",\n# \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n# }\n# ]\n Citations @misc{li2024selfalignmentinstructionbacktranslation,\n title={Self-Alignment with Instruction Backtranslation},\n author={Xian Li and Ping Yu and Chunting Zhou and Timo Schick and Omer Levy and Luke Zettlemoyer and Jason Weston and Mike Lewis},\n year={2024},\n eprint={2308.06259},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2308.06259},\n}\n Source code in src/distilabel/steps/tasks/instruction_backtranslation.py class InstructionBacktranslation(Task):\n \"\"\"Self-Alignment with Instruction Backtranslation.\n\n Attributes:\n _template: the Jinja2 template to use for the Instruction Backtranslation task.\n\n Input columns:\n - instruction (`str`): The reference instruction to evaluate the text output.\n - generation (`str`): The text output to evaluate for the given instruction.\n\n Output columns:\n - score (`str`): The score for the generation based on the given instruction.\n - reason (`str`): The reason for the provided score.\n - model_name (`str`): The model name used to score the generation.\n\n Categories:\n - critique\n\n References:\n - [`Self-Alignment with Instruction Backtranslation`](https://arxiv.org/abs/2308.06259)\n\n Examples:\n Generate a score and reason for a given instruction and generation:\n\n ```python\n from distilabel.steps.tasks import InstructionBacktranslation\n\n instruction_backtranslation = InstructionBacktranslation(\n name=\"instruction_backtranslation\",\n llm=llm,\n input_batch_size=10,\n output_mappings={\"model_name\": \"scoring_model\"},\n )\n instruction_backtranslation.load()\n\n result = next(\n instruction_backtranslation.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generation\": \"4\",\n }\n ]\n )\n )\n # result\n # [\n # {\n # \"instruction\": \"How much is 2+2?\",\n # \"generation\": \"4\",\n # \"score\": 3,\n # \"reason\": \"Reason for the generation.\",\n # \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{li2024selfalignmentinstructionbacktranslation,\n title={Self-Alignment with Instruction Backtranslation},\n author={Xian Li and Ping Yu and Chunting Zhou and Timo Schick and Omer Levy and Luke Zettlemoyer and Jason Weston and Mike Lewis},\n year={2024},\n eprint={2308.06259},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2308.06259},\n }\n ```\n \"\"\"\n\n _template: Optional[\"Template\"] = PrivateAttr(default=...)\n _can_be_used_with_offline_batch_generation = True\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / 
\"templates\"\n / \"instruction-backtranslation.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`, and the `generation` for it.\"\"\"\n return [\"instruction\", \"generation\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], generation=input[\"generation\"]\n ),\n },\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `score`, `reason` and the `model_name`.\"\"\"\n return [\"score\", \"reason\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `score` and `reason`. The\n `model_name` will be automatically included within the `process` method of `Task`.\n\n Args:\n output: a string representing the output of the LLM via the `process` method.\n input: the input to the task, as required by some tasks to format the output.\n\n Returns:\n A dictionary containing the `score` and the `reason` for the provided `score`.\n \"\"\"\n pattern = r\"(.+?)Score: (\\d)\"\n\n matches = None\n if output is not None:\n matches = re.findall(pattern, output, re.DOTALL)\n if matches is None:\n return {\"score\": None, \"reason\": None}\n\n return {\n \"score\": int(matches[0][1]),\n \"reason\": matches[0][0].strip(),\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.inputs","title":"inputs: List[str] property ","text":"The input for the task is the instruction , and the generation for it. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.outputs","title":"outputs: List[str] property ","text":"The output for the task is the score , reason and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/instruction_backtranslation.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"instruction-backtranslation.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. 
Source code in src/distilabel/steps/tasks/instruction_backtranslation.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], generation=input[\"generation\"]\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.format_output","title":"format_output(output, input) ","text":"The output is formatted as a dictionary with the score and reason . The model_name will be automatically included within the process method of Task . Parameters: Name Type Description Default output Union[str, None] a string representing the output of the LLM via the process method. required input Dict[str, Any] the input to the task, as required by some tasks to format the output. required Returns: Type Description Dict[str, Any] A dictionary containing the score and the reason for the provided score . Source code in src/distilabel/steps/tasks/instruction_backtranslation.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `score` and `reason`. The\n `model_name` will be automatically included within the `process` method of `Task`.\n\n Args:\n output: a string representing the output of the LLM via the `process` method.\n input: the input to the task, as required by some tasks to format the output.\n\n Returns:\n A dictionary containing the `score` and the `reason` for the provided `score`.\n \"\"\"\n pattern = r\"(.+?)Score: (\\d)\"\n\n matches = None\n if output is not None:\n matches = re.findall(pattern, output, re.DOTALL)\n if matches is None:\n return {\"score\": None, \"reason\": None}\n\n return {\n \"score\": int(matches[0][1]),\n \"reason\": matches[0][0].strip(),\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie","title":"Magpie ","text":" Bases: Task , MagpieBase Generates conversations using an instruct fine-tuned LLM. Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'. Attributes: Name Type Description n_turns the number of turns that the generated conversation will have. Defaults to 1 . end_with_user whether the conversation should end with a user message. Defaults to False . include_system_prompt whether to include the system prompt used in the generated conversation. Defaults to False . only_instruction whether to generate only the instruction. 
If this argument is True , then n_turns will be ignored. Defaults to False . system_prompt an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be choosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None . Runtime parameters n_turns : the number of turns that the generated conversation will have. Defaults to 1 . end_with_user : whether the conversation should end with a user message. Defaults to False . include_system_prompt : whether to include the system prompt used in the generated conversation. Defaults to False . only_instruction : whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . system_prompt : an optional system prompt or list of system prompts that can be used to steer the LLM to generate content of certain topic, guide the style, etc. If it's a list of system prompts, then a random system prompt will be chosen per input/output batch. If the provided inputs contains a system_prompt column, then this runtime parameter will be ignored and the one from the column will be used. Defaults to None . system_prompt : an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be choosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Input columns - system_prompt (
str , optional): an optional system prompt that can be provided to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Output columns - conversation (
ChatType ): the generated conversation, which is a list of chat items with a role and a message. Only if only_instruction=False . - instruction (
str ): the generated instructions if only_instruction=True or n_turns==1 . - response (
str ): the generated response if n_turns==1 . - system_prompt_key (
str , optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary. - model_name (
str ): The model name used to generate the conversation or instruction . Categories - text-generation
- instruction
References - Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
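A minimal sketch of the task-level system_prompt attribute described above, assuming the listed prompts are illustrative: when a list is provided, a random system prompt is chosen per input/output batch, and since the system_prompt input column is optional the rows can be empty dictionaries: from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 64,\n        },\n    ),\n    system_prompt=[\n        \"You're an AI assistant specialised in Python programming.\",\n        \"You're an AI assistant specialised in personal finance.\",\n    ],\n    only_instruction=True,\n)\n\nmagpie.load()\n\nresult = next(magpie.process(inputs=[{}, {}]))\n# each row contains an `instruction` steered by one of the system prompts above\n 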
Examples: Generating instructions with Llama 3 8B Instruct and TransformersLLM: from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n)\n\nmagpie.load()\n\nresult = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n)\n# [\n# {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n# {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n# ]\n Generating conversations with Llama 3 8B Instruct and TransformersLLM: from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n n_turns=2,\n)\n\nmagpie.load()\n\nresult = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n)\n# [\n# {\n# 'conversation': [\n# {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n# {\n# 'role': 'user',\n# 'content': 'I'm having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. It is read as \"the limit as x approaches a of f\n# of x\".'\n# },\n# {\n# 'role': 'assistant',\n# 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don't worry, I'm here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n# x\". What it's asking us to do is find the'\n# }\n# ]\n# },\n# {\n# 'conversation': [\n# {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n# {\n# 'role': 'user',\n# 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? Based on your expertise as a florist AI assistant, I think it\n# might be pests or diseases, but I'm not sure which.\"\n# },\n# {\n# 'role': 'assistant',\n# 'content': \"I'd be delighted to help you investigate the issue! Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. 
Here are a few potential culprits: 1.\n# **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n# }\n# ]\n# }\n# ]\n Source code in src/distilabel/steps/tasks/magpie/base.py class Magpie(Task, MagpieBase):\n \"\"\"Generates conversations using an instruct fine-tuned LLM.\n\n Magpie is a neat method that allows generating user instructions with no seed data\n or specific system prompt thanks to the autoregressive capabilities of the instruct\n fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message\n and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query\n or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the\n LLM without any user message, then the LLM will continue generating tokens as if it was\n the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM.\n After this instruct is generated, it can be sent again to the LLM to generate this time\n an assistant response. This process can be repeated N times allowing to build a multi-turn\n conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from\n Scratch by Prompting Aligned LLMs with Nothing'.\n\n Attributes:\n n_turns: the number of turns that the generated conversation will have.\n Defaults to `1`.\n end_with_user: whether the conversation should end with a user message.\n Defaults to `False`.\n include_system_prompt: whether to include the system prompt used in the generated\n conversation. Defaults to `False`.\n only_instruction: whether to generate only the instruction. If this argument is\n `True`, then `n_turns` will be ignored. Defaults to `False`.\n system_prompt: an optional system prompt, or a list of system prompts from which\n a random one will be chosen, or a dictionary of system prompts from which a\n random one will be choosen, or a dictionary of system prompts with their probability\n of being chosen. The random system prompt will be chosen per input/output batch.\n This system prompt can be used to guide the generation of the instruct LLM and\n steer it to generate instructions of a certain topic. Defaults to `None`.\n\n Runtime parameters:\n - `n_turns`: the number of turns that the generated conversation will have. Defaults\n to `1`.\n - `end_with_user`: whether the conversation should end with a user message.\n Defaults to `False`.\n - `include_system_prompt`: whether to include the system prompt used in the generated\n conversation. Defaults to `False`.\n - `only_instruction`: whether to generate only the instruction. If this argument is\n `True`, then `n_turns` will be ignored. Defaults to `False`.\n - `system_prompt`: an optional system prompt or list of system prompts that can\n be used to steer the LLM to generate content of certain topic, guide the style,\n etc. If it's a list of system prompts, then a random system prompt will be chosen\n per input/output batch. If the provided inputs contains a `system_prompt` column,\n then this runtime parameter will be ignored and the one from the column will\n be used. Defaults to `None`.\n - `system_prompt`: an optional system prompt, or a list of system prompts from which\n a random one will be chosen, or a dictionary of system prompts from which a\n random one will be choosen, or a dictionary of system prompts with their probability\n of being chosen. 
The random system prompt will be chosen per input/output batch.\n This system prompt can be used to guide the generation of the instruct LLM and\n steer it to generate instructions of a certain topic.\n\n Input columns:\n - system_prompt (`str`, optional): an optional system prompt that can be provided\n to guide the generation of the instruct LLM and steer it to generate instructions\n of certain topic.\n\n Output columns:\n - conversation (`ChatType`): the generated conversation which is a list of chat\n items with a role and a message. Only if `only_instruction=False`.\n - instruction (`str`): the generated instructions if `only_instruction=True` or `n_turns==1`.\n - response (`str`): the generated response if `n_turns==1`.\n - system_prompt_key (`str`, optional): the key of the system prompt used to generate\n the conversation or instruction. Only if `system_prompt` is a dictionary.\n - model_name (`str`): The model name used to generate the `conversation` or `instruction`.\n\n Categories:\n - text-generation\n - instruction\n\n References:\n - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464)\n\n Examples:\n Generating instructions with Llama 3 8B Instruct and TransformersLLM:\n\n ```python\n from distilabel.models import TransformersLLM\n from distilabel.steps.tasks import Magpie\n\n magpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n )\n\n magpie.load()\n\n result = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n )\n # [\n # {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n # {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n # ]\n ```\n\n Generating conversations with Llama 3 8B Instruct and TransformersLLM:\n\n ```python\n from distilabel.models import TransformersLLM\n from distilabel.steps.tasks import Magpie\n\n magpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n n_turns=2,\n )\n\n magpie.load()\n\n result = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n )\n # [\n # {\n # 'conversation': [\n # {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n # {\n # 'role': 'user',\n # 'content': 'I\\'m having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. 
It is read as \"the limit as x approaches a of f\n # of x\".'\n # },\n # {\n # 'role': 'assistant',\n # 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don\\'t worry, I\\'m here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n # x\". What it\\'s asking us to do is find the'\n # }\n # ]\n # },\n # {\n # 'conversation': [\n # {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n # {\n # 'role': 'user',\n # 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? Based on your expertise as a florist AI assistant, I think it\n # might be pests or diseases, but I'm not sure which.\"\n # },\n # {\n # 'role': 'assistant',\n # 'content': \"I'd be delighted to help you investigate the issue! Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. Here are a few potential culprits: 1.\n # **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n # }\n # ]\n # }\n # ]\n ```\n \"\"\"\n\n def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n super().model_post_init(__context)\n\n if not isinstance(self.llm, MagpieChatTemplateMixin):\n raise DistilabelUserError(\n f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n page=\"components-gallery/tasks/magpie/\",\n )\n\n self.llm.use_magpie_template = True\n\n @property\n def inputs(self) -> \"StepColumns\":\n return {\"system_prompt\": False}\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Does nothing.\"\"\"\n return []\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"Either a multi-turn conversation or the instruction generated.\"\"\"\n outputs = []\n\n if self.only_instruction:\n outputs.append(\"instruction\")\n elif self.n_turns == 1:\n outputs.extend([\"instruction\", \"response\"])\n else:\n outputs.append(\"conversation\")\n\n if isinstance(self.system_prompt, dict):\n outputs.append(\"system_prompt_key\")\n\n outputs.append(\"model_name\")\n\n return outputs\n\n def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n ) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n\n def process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Generate a list of instructions or conversations of the specified number of turns.\n\n Args:\n inputs: a list of dictionaries that can contain a `system_prompt` key.\n\n Yields:\n The list of generated conversations.\n \"\"\"\n yield self._generate_with_pre_query_template(inputs)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.outputs","title":"outputs: StepColumns property ","text":"Either a multi-turn conversation or the instruction generated. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.model_post_init","title":"model_post_init(__context) ","text":"Checks that the provided LLM uses the MagpieChatTemplateMixin . 
Source code in src/distilabel/steps/tasks/magpie/base.py def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n super().model_post_init(__context)\n\n if not isinstance(self.llm, MagpieChatTemplateMixin):\n raise DistilabelUserError(\n f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n page=\"components-gallery/tasks/magpie/\",\n )\n\n self.llm.use_magpie_template = True\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.format_input","title":"format_input(input) ","text":"Does nothing. Source code in src/distilabel/steps/tasks/magpie/base.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Does nothing.\"\"\"\n return []\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.format_output","title":"format_output(output, input=None) ","text":"Does nothing. Source code in src/distilabel/steps/tasks/magpie/base.py def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.process","title":"process(inputs) ","text":"Generate a list of instructions or conversations of the specified number of turns. Parameters: Name Type Description Default inputs StepInput a list of dictionaries that can contain a system_prompt key. required Yields: Type Description StepOutput The list of generated conversations. Source code in src/distilabel/steps/tasks/magpie/base.py def process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Generate a list of instructions or conversations of the specified number of turns.\n\n Args:\n inputs: a list of dictionaries that can contain a `system_prompt` key.\n\n Yields:\n The list of generated conversations.\n \"\"\"\n yield self._generate_with_pre_query_template(inputs)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator","title":"MagpieGenerator ","text":" Bases: GeneratorTask , MagpieBase Generator task the generates instructions or conversations using Magpie. Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'. Attributes: Name Type Description n_turns the number of turns that the generated conversation will have. Defaults to 1 . end_with_user whether the conversation should end with a user message. Defaults to False . include_system_prompt whether to include the system prompt used in the generated conversation. Defaults to False . 
only_instruction whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . system_prompt an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None . num_rows RuntimeParameter[int] the number of rows to be generated. Runtime parameters n_turns : the number of turns that the generated conversation will have. Defaults to 1 . end_with_user : whether the conversation should end with a user message. Defaults to False . include_system_prompt : whether to include the system prompt used in the generated conversation. Defaults to False . only_instruction : whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . system_prompt : an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. num_rows : the number of rows to be generated. Output columns - conversation (
ChatType ): the generated conversation which is a list of chat items with a role and a message. - instruction (
str ): the generated instructions if only_instruction=True . - response (
str ): the generated response if n_turns==1 . - system_prompt_key (
str , optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary. - model_name (
str ): The model name used to generate the conversation or instruction . Categories - text-generation
- instruction
- generator
References - Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
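The pre-query trick described above does not depend on any distilabel-specific API, so it can be sketched directly with `transformers`. The snippet below is a simplified illustration, not the `MagpieGenerator` implementation: it assumes a Llama 3 chat template and writes the pre-query tokens by hand to show how an instruction is "extracted" from the instruct fine-tuned LLM.

```python
# Minimal sketch of the pre-query trick (illustrative only, not the
# MagpieGenerator internals). It assumes a Hugging Face `transformers` model
# and hand-writes the Llama 3 chat-template tokens up to the user header, so
# the instruct-tuned model keeps generating "as if it were the user".
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

system_prompt = "You're a math expert AI assistant."

# The prompt ends right after the user pre-query tokens, with no user message.
pre_query = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    f"{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
)
inputs = tokenizer(pre_query, return_tensors="pt").to(model.device)
generated = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=1.0)
instruction = tokenizer.decode(
    generated[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(instruction)
# The generated instruction can then be sent back with the regular chat template
# to obtain the assistant response, and the loop repeated to build `n_turns`.
```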
Examples: Generating instructions with Llama 3 8B Instruct and TransformersLLM: from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n# [\n# {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n# {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n# ],\n# True\n# )\n Generating a conversation with Llama 3 8B Instruct and TransformersLLM: from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n n_turns=3,\n num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n# [\n# {\n# 'conversation': [\n# {\n# 'role': 'system',\n# 'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n# },\n# {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n# {\n# 'role': 'assistant',\n# 'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n# let's break down the basics. First, we need to identify your goals and target audience. What do\"\n# },\n# {\n# 'role': 'user',\n# 'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n# expenses you might encounter: 1. Time: As the business owner, you'll likely spend time creating\"\n# },\n# {\n# 'role': 'assistant',\n# 'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n# agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n# }\n# ]\n# },\n# {\n# 'conversation': [\n# {\n# 'role': 'system',\n# 'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n# },\n# {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n# {\n# 'role': 'assistant',\n# 'content': 'Exciting times ahead! 
When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n# **Purpose**: How will you use your laptop or computer? For work, gaming, video editing,'\n# },\n# {\n# 'role': 'user',\n# 'content': 'Let me stop you there. Let's explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? For example, if I're primarily using my\n# laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n# },\n# {\n# 'role': 'assistant',\n# 'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n# option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n# }\n# ]\n# }\n# ],\n# True\n# )\n Generating with system prompts with probabilities: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\nmagpie = MagpieGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 0.8,\n \"max_new_tokens\": 256,\n },\n ),\n n_turns=2,\n system_prompt={\n \"math\": (\"You're an expert AI assistant.\", 0.8),\n \"writing\": (\"You're an expert writing assistant.\", 0.2),\n },\n)\n\nmagpie.load()\n\nresult = next(magpie.process())\n Citations @misc{xu2024magpiealignmentdatasynthesis,\n title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n year={2024},\n eprint={2406.08464},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2406.08464},\n}\n Source code in src/distilabel/steps/tasks/magpie/generator.py class MagpieGenerator(GeneratorTask, MagpieBase):\n \"\"\"Generator task the generates instructions or conversations using Magpie.\n\n Magpie is a neat method that allows generating user instructions with no seed data\n or specific system prompt thanks to the autoregressive capabilities of the instruct\n fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message\n and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query\n or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the\n LLM without any user message, then the LLM will continue generating tokens as it was\n the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM.\n After this instruct is generated, it can be sent again to the LLM to generate this time\n an assistant response. This process can be repeated N times allowing to build a multi-turn\n conversation. 
This method was described in the paper 'Magpie: Alignment Data Synthesis from\n Scratch by Prompting Aligned LLMs with Nothing'.\n\n Attributes:\n n_turns: the number of turns that the generated conversation will have.\n Defaults to `1`.\n end_with_user: whether the conversation should end with a user message.\n Defaults to `False`.\n include_system_prompt: whether to include the system prompt used in the generated\n conversation. Defaults to `False`.\n only_instruction: whether to generate only the instruction. If this argument is\n `True`, then `n_turns` will be ignored. Defaults to `False`.\n system_prompt: an optional system prompt, or a list of system prompts from which\n a random one will be chosen, or a dictionary of system prompts from which a\n random one will be choosen, or a dictionary of system prompts with their probability\n of being chosen. The random system prompt will be chosen per input/output batch.\n This system prompt can be used to guide the generation of the instruct LLM and\n steer it to generate instructions of a certain topic. Defaults to `None`.\n num_rows: the number of rows to be generated.\n\n Runtime parameters:\n - `n_turns`: the number of turns that the generated conversation will have. Defaults\n to `1`.\n - `end_with_user`: whether the conversation should end with a user message.\n Defaults to `False`.\n - `include_system_prompt`: whether to include the system prompt used in the generated\n conversation. Defaults to `False`.\n - `only_instruction`: whether to generate only the instruction. If this argument is\n `True`, then `n_turns` will be ignored. Defaults to `False`.\n - `system_prompt`: an optional system prompt, or a list of system prompts from which\n a random one will be chosen, or a dictionary of system prompts from which a\n random one will be choosen, or a dictionary of system prompts with their probability\n of being chosen. The random system prompt will be chosen per input/output batch.\n This system prompt can be used to guide the generation of the instruct LLM and\n steer it to generate instructions of a certain topic.\n - `num_rows`: the number of rows to be generated.\n\n Output columns:\n - conversation (`ChatType`): the generated conversation which is a list of chat\n items with a role and a message.\n - instruction (`str`): the generated instructions if `only_instruction=True`.\n - response (`str`): the generated response if `n_turns==1`.\n - system_prompt_key (`str`, optional): the key of the system prompt used to generate\n the conversation or instruction. 
Only if `system_prompt` is a dictionary.\n - model_name (`str`): The model name used to generate the `conversation` or `instruction`.\n\n Categories:\n - text-generation\n - instruction\n - generator\n\n References:\n - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464)\n\n Examples:\n Generating instructions with Llama 3 8B Instruct and TransformersLLM:\n\n ```python\n from distilabel.models import TransformersLLM\n from distilabel.steps.tasks import MagpieGenerator\n\n generator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n num_rows=5,\n )\n\n generator.load()\n\n result = next(generator.process())\n # (\n # [\n # {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n # {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n # ],\n # True\n # )\n ```\n\n Generating a conversation with Llama 3 8B Instruct and TransformersLLM:\n\n ```python\n from distilabel.models import TransformersLLM\n from distilabel.steps.tasks import MagpieGenerator\n\n generator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n n_turns=3,\n num_rows=5,\n )\n\n generator.load()\n\n result = next(generator.process())\n # (\n # [\n # {\n # 'conversation': [\n # {\n # 'role': 'system',\n # 'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n # insightful responses to help the user with their queries.'\n # },\n # {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n # {\n # 'role': 'assistant',\n # 'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n # let's break down the basics. First, we need to identify your goals and target audience. What do\"\n # },\n # {\n # 'role': 'user',\n # 'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n # expenses you might encounter: 1. Time: As the business owner, you'll likely spend time creating\"\n # },\n # {\n # 'role': 'assistant',\n # 'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n # agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n # }\n # ]\n # },\n # {\n # 'conversation': [\n # {\n # 'role': 'system',\n # 'content': 'You are a helpful Al assistant. 
The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n # insightful responses to help the user with their queries.'\n # },\n # {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n # {\n # 'role': 'assistant',\n # 'content': 'Exciting times ahead! When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n # **Purpose**: How will you use your laptop or computer? For work, gaming, video editing,'\n # },\n # {\n # 'role': 'user',\n # 'content': 'Let me stop you there. Let\\'s explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? For example, if I\\'re primarily using my\n # laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n # },\n # {\n # 'role': 'assistant',\n # 'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n # option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n # }\n # ]\n # }\n # ],\n # True\n # )\n ```\n\n Generating with system prompts with probabilities:\n\n ```python\n from distilabel.models import InferenceEndpointsLLM\n from distilabel.steps.tasks import MagpieGenerator\n\n magpie = MagpieGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 0.8,\n \"max_new_tokens\": 256,\n },\n ),\n n_turns=2,\n system_prompt={\n \"math\": (\"You're an expert AI assistant.\", 0.8),\n \"writing\": (\"You're an expert writing assistant.\", 0.2),\n },\n )\n\n magpie.load()\n\n result = next(magpie.process())\n ```\n\n Citations:\n ```\n @misc{xu2024magpiealignmentdatasynthesis,\n title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n year={2024},\n eprint={2406.08464},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2406.08464},\n }\n ```\n \"\"\"\n\n # TODO: move this to `GeneratorTask`\n num_rows: RuntimeParameter[int] = Field(\n default=None, description=\"The number of rows to generate.\"\n )\n\n def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n super().model_post_init(__context)\n\n if not isinstance(self.llm, MagpieChatTemplateMixin):\n raise DistilabelUserError(\n f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n page=\"components-gallery/tasks/magpiegenerator/\",\n )\n\n self.llm.use_magpie_template = True\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"Either a multi-turn conversation or the instruction 
generated.\"\"\"\n outputs = []\n\n if self.only_instruction:\n outputs.append(\"instruction\")\n elif self.n_turns == 1:\n outputs.extend([\"instruction\", \"response\"])\n else:\n outputs.append(\"conversation\")\n\n if isinstance(self.system_prompt, dict):\n outputs.append(\"system_prompt_key\")\n\n outputs.append(\"model_name\")\n\n return outputs\n\n def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n ) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Generates the desired number of instructions or conversations using Magpie.\n\n Args:\n offset: The offset to start the generation from. Defaults to `0`.\n\n Yields:\n The generated instructions or conversations.\n \"\"\"\n generated = offset\n\n while generated <= self.num_rows: # type: ignore\n rows_to_generate = (\n self.num_rows if self.num_rows < self.batch_size else self.batch_size # type: ignore\n )\n conversations = self._generate_with_pre_query_template(\n inputs=[{} for _ in range(rows_to_generate)] # type: ignore\n )\n generated += rows_to_generate # type: ignore\n yield (conversations, generated == self.num_rows)\n\n @override\n def _sample_input(self) -> \"ChatType\":\n return self._generate_with_pre_query_template(inputs=[{}])\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.outputs","title":"outputs: StepColumns property ","text":"Either a multi-turn conversation or the instruction generated. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.model_post_init","title":"model_post_init(__context) ","text":"Checks that the provided LLM uses the MagpieChatTemplateMixin . Source code in src/distilabel/steps/tasks/magpie/generator.py def model_post_init(self, __context: Any) -> None:\n \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n super().model_post_init(__context)\n\n if not isinstance(self.llm, MagpieChatTemplateMixin):\n raise DistilabelUserError(\n f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n page=\"components-gallery/tasks/magpiegenerator/\",\n )\n\n self.llm.use_magpie_template = True\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.format_output","title":"format_output(output, input=None) ","text":"Does nothing. Source code in src/distilabel/steps/tasks/magpie/generator.py def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.process","title":"process(offset=0) ","text":"Generates the desired number of instructions or conversations using Magpie. Parameters: Name Type Description Default offset int The offset to start the generation from. Defaults to 0 . 0 Yields: Type Description GeneratorStepOutput The generated instructions or conversations. Source code in src/distilabel/steps/tasks/magpie/generator.py def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n \"\"\"Generates the desired number of instructions or conversations using Magpie.\n\n Args:\n offset: The offset to start the generation from. 
Defaults to `0`.\n\n Yields:\n The generated instructions or conversations.\n \"\"\"\n generated = offset\n\n while generated <= self.num_rows: # type: ignore\n rows_to_generate = (\n self.num_rows if self.num_rows < self.batch_size else self.batch_size # type: ignore\n )\n conversations = self._generate_with_pre_query_template(\n inputs=[{} for _ in range(rows_to_generate)] # type: ignore\n )\n generated += rows_to_generate # type: ignore\n yield (conversations, generated == self.num_rows)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter","title":"MathShepherdCompleter ","text":" Bases: Task Math Shepherd Completer and auto-labeller task. This task is in charge of, given a list of solutions to an instruction, and a golden solution, as reference, generate completions for the solutions, and label them according to the golden solution using the hard estimation method from figure 2 in the reference paper, Eq. 3. The attributes make the task flexible to be used with different types of dataset and LLMs, and allow making use of different fields to modify the system and user prompts for it. Before modifying them, review the current defaults to ensure the completions are generated correctly. Attributes: Name Type Description system_prompt Optional[str] The system prompt to be used in the completions. The default one has been checked and generates good completions using Llama 3.1 with 8B and 70B, but it can be modified to adapt it to the model and dataset selected. extra_rules Optional[str] This field can be used to insert extra rules relevant to the type of dataset. For example, in the original paper they used GSM8K and MATH datasets, and this field can be used to insert the rules for the GSM8K dataset. few_shots Optional[str] Few shots to help the model generating the completions, write them in the format of the type of solutions wanted for your dataset. N PositiveInt Number of completions to generate for each step, correspond to N in the paper. They used 8 in the paper, but it can be adjusted. tags list[str] List of tags to be used in the completions, the default ones are [\"+\", \"-\"] as in the paper, where the first is used as a positive label, and the second as a negative one. This can be updated, but it MUST be a list with 2 elements, where the first is the positive one, and the second the negative one. Input columns - instruction (
str ): The task or instruction. - solutions (
List[str] ): List of solutions to the task. - golden_solution (
str ): The reference solution to the task, will be used to annotate the candidate solutions. Output columns - solutions (
List[str] ): The same \"solutions\" column that was provided as input, with each step annotated. - model_name (
str ): The name of the model used to generate the completions. Categories - text-generation
- labelling
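The auto-labelling follows the hard-estimation criterion from the paper: a step is marked with the positive tag if at least one of the N completions sampled from it reaches the golden answer, and with the negative tag otherwise. The sketch below illustrates just that criterion with a hypothetical helper; it is not the task's implementation, which applies the same rule inside `_auto_label` over batched LLM completions.

```python
# Minimal sketch of the hard-estimation labelling rule (Eq. 3 in the paper)
# applied per step. The helper name is introduced here only for illustration.
from typing import List


def hard_estimation_label(
    completions: List[List[str]],  # the N completions sampled from a given step
    golden_answer: str,            # final line of the golden solution
    tags: tuple = ("+", "-"),      # (positive, negative), matching the task defaults
) -> str:
    """Label a step '+' if at least one of its N completions reaches the
    golden answer, and '-' otherwise."""
    for completion in completions:
        if completion and completion[-1] == golden_answer:
            return tags[0]
    return tags[1]


# One out of three completions ends in the golden answer, so the step is positive.
print(hard_estimation_label(
    completions=[
        ["Step 2: She makes 10 * 2 = $20.", "The answer is: 20"],
        ["Step 2: She makes 9 * 2 = $18.", "The answer is: 18"],
        ["Step 2: She makes 8.5 * 2 = $17.", "The answer is: 17"],
    ],
    golden_answer="The answer is: 18",
))  # -> "+"
```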
References Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations Examples: Annotate your steps with the Math Shepherd Completer using the structured outputs (the preferred way): from distilabel.steps.tasks import MathShepherdCompleter\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdCompleter(\n llm=llm,\n N=3,\n use_default_structured_output=True\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"],\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n Annotate your steps with the Math Shepherd Completer: from distilabel.steps.tasks import MathShepherdCompleter\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdCompleter(\n llm=llm,\n N=3\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"],\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n Citations: ```\n@misc{wang2024mathshepherdverifyreinforcellms,\n title={Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations},\n author={Peiyi Wang and Lei Li and Zhihong Shao and R. X. Xu and Damai Dai and Yifei Li and Deli Chen and Y. Wu and Zhifang Sui},\n year={2024},\n eprint={2312.08935},\n archivePrefix={arXiv},\n primaryClass={cs.AI},\n url={https://arxiv.org/abs/2312.08935},\n}\n```\n Source code in src/distilabel/steps/tasks/math_shepherd/completer.py class MathShepherdCompleter(Task):\n \"\"\"Math Shepherd Completer and auto-labeller task.\n\n This task is in charge of, given a list of solutions to an instruction, and a golden solution,\n as reference, generate completions for the solutions, and label them according to the golden\n solution using the hard estimation method from figure 2 in the reference paper, Eq. 3.\n The attributes make the task flexible to be used with different types of dataset and LLMs, and\n allow making use of different fields to modify the system and user prompts for it. Before modifying\n them, review the current defaults to ensure the completions are generated correctly.\n\n Attributes:\n system_prompt: The system prompt to be used in the completions. 
The default one has been\n checked and generates good completions using Llama 3.1 with 8B and 70B,\n but it can be modified to adapt it to the model and dataset selected.\n extra_rules: This field can be used to insert extra rules relevant to the type of dataset.\n For example, in the original paper they used GSM8K and MATH datasets, and this field\n can be used to insert the rules for the GSM8K dataset.\n few_shots: Few shots to help the model generating the completions, write them in the\n format of the type of solutions wanted for your dataset.\n N: Number of completions to generate for each step, correspond to N in the paper.\n They used 8 in the paper, but it can be adjusted.\n tags: List of tags to be used in the completions, the default ones are [\"+\", \"-\"] as in the\n paper, where the first is used as a positive label, and the second as a negative one.\n This can be updated, but it MUST be a list with 2 elements, where the first is the\n positive one, and the second the negative one.\n\n Input columns:\n - instruction (`str`): The task or instruction.\n - solutions (`List[str]`): List of solutions to the task.\n - golden_solution (`str`): The reference solution to the task, will be used\n to annotate the candidate solutions.\n\n Output columns:\n - solutions (`List[str]`): The same columns that were used as input, the \"solutions\" is modified.\n - model_name (`str`): The name of the model used to generate the revision.\n\n Categories:\n - text-generation\n - labelling\n\n References:\n - [`Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations`](https://arxiv.org/abs/2312.08935)\n\n Examples:\n Annotate your steps with the Math Shepherd Completer using the structured outputs (the preferred way):\n\n ```python\n from distilabel.steps.tasks import MathShepherdCompleter\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n )\n task = MathShepherdCompleter(\n llm=llm,\n N=3,\n use_default_structured_output=True\n )\n\n task.load()\n\n result = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n )\n # [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?\",\n # 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"],\n # 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n ```\n\n Annotate your steps with the Math Shepherd Completer:\n\n ```python\n from distilabel.steps.tasks import MathShepherdCompleter\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n )\n task = MathShepherdCompleter(\n llm=llm,\n N=3\n )\n\n task.load()\n\n result = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n )\n # [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n # 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"],\n # 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n ```\n\n Citations:\n\n ```\n @misc{wang2024mathshepherdverifyreinforcellms,\n title={Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations},\n author={Peiyi Wang and Lei Li and Zhihong Shao and R. X. Xu and Damai Dai and Yifei Li and Deli Chen and Y. 
Wu and Zhifang Sui},\n year={2024},\n eprint={2312.08935},\n archivePrefix={arXiv},\n primaryClass={cs.AI},\n url={https://arxiv.org/abs/2312.08935},\n }\n ```\n \"\"\"\n\n system_prompt: Optional[str] = SYSTEM_PROMPT\n extra_rules: Optional[str] = RULES_GSM8K\n few_shots: Optional[str] = FEW_SHOTS_GSM8K\n N: PositiveInt = 1\n tags: list[str] = [\"+\", \"-\"]\n\n def load(self) -> None:\n super().load()\n\n if self.system_prompt is not None:\n self.system_prompt = Template(self.system_prompt).render(\n extra_rules=self.extra_rules or \"\",\n few_shots=self.few_shots or \"\",\n structured_prompt=SYSTEM_PROMPT_STRUCTURED\n if self.use_default_structured_output\n else \"\",\n )\n if self.use_default_structured_output:\n self._template = Template(TEMPLATE_STRUCTURED)\n else:\n self._template = Template(TEMPLATE)\n\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"instruction\", \"solutions\", \"golden_solution\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n messages = [\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n instruction=input[\"instruction\"], N=self.N\n ),\n }\n ]\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n return messages # type: ignore\n\n def _parse_output(self, output: Union[str, None]) -> list[list[str]]:\n if output is None:\n return [[\"\"]] * self.N\n\n if self.N > 1:\n output_transformed = ( # type: ignore\n self._format_structured_output(output)\n if self.use_default_structured_output\n else output.split(\"---\")\n )\n examples = [split_solution_steps(o) for o in output_transformed]\n # In case there aren't the expected number of completions, we fill it with \"\", or short the list.\n # This shoulnd't happen if the LLM works as expected, but it's a safety measure as it can be\n # difficult to debug if the completions don't match the solutions.\n if len(examples) < self.N:\n examples.extend([\"\"] * (self.N - len(examples))) # type: ignore\n elif len(examples) > self.N:\n examples = examples[: self.N]\n else:\n output_transformed = (\n self._format_structured_output(output)[0]\n if self.use_default_structured_output\n else output\n )\n examples = [split_solution_steps(output_transformed)]\n return examples\n\n def _format_structured_output(self, output: str) -> list[str]:\n default_output = [\"\"] * self.N if self.N else [\"\"]\n if parsed_output := parse_json_response(output):\n solutions = parsed_output[\"solutions\"]\n extracted_solutions = [solution[\"solution\"] for solution in solutions]\n if len(output) != self.N:\n extracted_solutions = default_output\n return extracted_solutions\n return default_output\n\n def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n ) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n\n def process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Does the processing of generation completions for the solutions, and annotate\n each step with the logic found in Figure 2 of the paper, with the hard estimation (Eq. (3)).\n\n Args:\n inputs: Inputs to the step\n\n Yields:\n Annotated inputs with the completions.\n \"\"\"\n\n # A list with all the inputs to be passed to the LLM. 
Needs another structure to\n # find them afterwards\n prepared_inputs = []\n # Data structure with the indices of the elements.\n # (i, j, k) where i is the input, j is the solution, and k is the completion\n input_positions = []\n golden_answers = []\n for i, input in enumerate(inputs):\n instruction = input[\"instruction\"]\n golden_solution = input[\"golden_solution\"] # This is a single solution\n golden_answers.append(golden_solution[-1])\n # This contains a list of solutions\n solutions = input[\"solutions\"]\n for j, solution in enumerate(solutions):\n # For each solution, that has K steps, we have to generate N completions\n # for the first K-2 steps (-2 because the last 2 steps are the last step, and\n # the answer itself, which can be directly compared against golden answer)\n prepared_completions = self._prepare_completions(instruction, solution)\n prepared_inputs.extend(prepared_completions)\n input_positions.extend(\n [(i, j, k) for k in range(len(prepared_completions))]\n )\n\n # Send the elements in batches to the LLM to speed up the process\n final_outputs = []\n # Added here to simplify testing in case we don't have anything to process\n # TODO: Ensure the statistics has the same shape as all the outputs, raw_outputs, and raw_inputs\n statistics = []\n total_raw_outputs = []\n total_raw_inputs = []\n for inner_batch in batched(prepared_inputs, self.input_batch_size): # type: ignore\n outputs = self.llm.generate_outputs(\n inputs=inner_batch,\n num_generations=1,\n **self.llm.get_generation_kwargs(), # type: ignore\n )\n\n formatted_outputs = []\n stats = []\n raw_outputs = []\n raw_inputs = []\n for i, output in enumerate(outputs):\n generation = output[\"generations\"][0]\n raw_inputs.append(inner_batch[i])\n raw_outputs.append(generation or \"\")\n formatted_outputs.append(self._parse_output(generation))\n stats.append(output[\"statistics\"])\n\n final_outputs.extend(formatted_outputs)\n statistics.extend(stats)\n total_raw_outputs.extend(raw_outputs)\n total_raw_inputs.extend(raw_inputs)\n\n yield self._auto_label( # type: ignore\n inputs,\n final_outputs,\n input_positions,\n golden_answers,\n statistics,\n total_raw_outputs,\n total_raw_inputs,\n )\n\n def _prepare_completions(\n self, instruction: str, steps: list[str]\n ) -> List[\"ChatType\"]:\n \"\"\"Helper method to create, given a solution (a list of steps), and a instruction, the\n texts to be completed by the LLM.\n\n Args:\n instruction: Instruction of the problem.\n steps: List of steps that are part of the solution.\n\n Returns:\n List of ChatType, where each ChatType is the prompt corresponding to one of the steps\n to be completed.\n \"\"\"\n prepared_inputs = []\n # Use the number of completions that correspond to a given instruction/steps pair\n # to find afterwards the input that corresponds to a given completion (to do the labelling)\n num_completions = len(steps[:-2])\n for i in range(1, num_completions + 1):\n to_complete = instruction + \" \" + \"\\n\".join(steps[:i])\n prepared_inputs.append(self.format_input({\"instruction\": to_complete}))\n\n return prepared_inputs\n\n def _auto_label(\n self,\n inputs: StepInput,\n final_outputs: list[Completions],\n input_positions: list[tuple[int, int, int]],\n golden_answers: list[str],\n statistics: list[\"LLMStatistics\"],\n raw_outputs: list[str],\n raw_inputs: list[str],\n ) -> StepInput:\n \"\"\"Labels the steps inplace (in the inputs), and returns the inputs.\n\n Args:\n inputs: The original inputs\n final_outputs: List of generations from the LLM.\n 
It's organized as a list where the elements sent to the LLM are\n grouped together, then each element contains the completions, and\n each completion is a list of steps.\n input_positions: A list with tuples generated in the process method\n that contains (i, j, k) where i is the index of the input, j is the\n index of the solution, and k is the index of the completion.\n golden_answers: List of golden answers for each input.\n statistics: List of statistics from the LLM.\n raw_outputs: List of raw outputs from the LLM.\n raw_inputs: List of raw inputs to the LLM.\n\n Returns:\n Inputs annotated.\n \"\"\"\n for i, (instruction_i, solution_i, step_i) in enumerate(input_positions):\n input = inputs[instruction_i]\n solutions = input[\"solutions\"]\n n_completions = final_outputs[i]\n label = f\" {self.tags[1]}\"\n for completion in n_completions:\n if len(completion) == 0:\n # This can be a failed generation\n label = \"\" # Everyting stays the same\n self._logger.info(\"Completer failed due to empty completion\")\n continue\n if completion[-1] == golden_answers[instruction_i]:\n label = f\" { self.tags[0]}\"\n # If we found one, it's enough as we are doing Hard Estimation\n continue\n # In case we had no solutions from the previous step, otherwise we would have\n # an IndexError\n if not solutions[solution_i]:\n continue\n solutions[solution_i][step_i] += label\n inputs[instruction_i][\"solutions\"] = solutions\n\n for i, input in enumerate(inputs):\n solutions = input[\"solutions\"]\n new_solutions = []\n for solution in solutions:\n if not solution or (len(solution) == 1):\n # The generation may fail to generate the expected\n # completions, or just added an extra empty completion,\n # we skip it.\n # Other possible error is having a list of solutions\n # with a single item, so when we call .pop, we are left\n # with an empty list, so we skip it too.\n new_solutions.append(solution)\n continue\n\n answer = solution.pop()\n label = (\n f\" {self.tags[0]}\"\n if answer == golden_answers[i]\n else f\" {self.tags[1]}\"\n )\n solution[-1] += \" \" + answer + label\n new_solutions.append(solution)\n\n # Only add the solutions if the data was properly parsed\n input[\"solutions\"] = new_solutions if new_solutions else input[\"solutions\"]\n input = self._add_metadata(\n input, statistics[i], raw_outputs[i], raw_inputs[i]\n )\n\n return inputs\n\n def _add_metadata(\n self,\n input: dict[str, Any],\n statistics: list[\"LLMStatistics\"],\n raw_output: Union[str, None],\n raw_input: Union[list[dict[str, Any]], None],\n ) -> dict[str, Any]:\n \"\"\"Adds the `distilabel_metadata` to the input.\n\n This method comes for free in the general Tasks, but as we have reimplemented the `process`,\n we have to repeat it here.\n\n Args:\n input: The input to add the metadata to.\n statistics: The statistics from the LLM.\n raw_output: The raw output from the LLM.\n raw_input: The raw input to the LLM.\n\n Returns:\n The input with the metadata added if applies.\n \"\"\"\n input[\"model_name\"] = self.llm.model_name\n\n if DISTILABEL_METADATA_KEY not in input:\n input[DISTILABEL_METADATA_KEY] = {}\n # If the solutions are splitted afterwards, the statistics should be splitted\n # to avoid counting extra tokens\n input[DISTILABEL_METADATA_KEY][f\"statistics_{self.name}\"] = statistics\n\n # Let some defaults in case something failed and we had None, otherwise when reading\n # the parquet files using pyarrow, the following error will appear:\n # ArrowInvalid: Schema\n if self.add_raw_input:\n 
input[DISTILABEL_METADATA_KEY][f\"raw_input_{self.name}\"] = raw_input or [\n {\"content\": \"\", \"role\": \"\"}\n ]\n if self.add_raw_output:\n input[DISTILABEL_METADATA_KEY][f\"raw_output_{self.name}\"] = raw_output or \"\"\n return input\n\n @override\n def get_structured_output(self) -> dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel, Field\n\n class Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\n class MathShepherdCompleter(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\n MathShepherdCompleter.model_json_schema()\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Solution\": {\n \"properties\": {\n \"solution\": {\n \"description\": \"Step by step solution leading to the final answer\",\n \"title\": \"Solution\",\n \"type\": \"string\",\n }\n },\n \"required\": [\"solution\"],\n \"title\": \"Solution\",\n \"type\": \"object\",\n }\n },\n \"properties\": {\n \"solutions\": {\n \"description\": \"List of solutions\",\n \"items\": {\"$ref\": \"#/$defs/Solution\"},\n \"title\": \"Solutions\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"solutions\"],\n \"title\": \"MathShepherdGenerator\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter.format_output","title":"format_output(output, input=None) ","text":"Does nothing. Source code in src/distilabel/steps/tasks/math_shepherd/completer.py def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n \"\"\"Does nothing.\"\"\"\n return {}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter.process","title":"process(inputs) ","text":"Does the processing of generation completions for the solutions, and annotate each step with the logic found in Figure 2 of the paper, with the hard estimation (Eq. (3)). Parameters: Name Type Description Default inputs StepInput Inputs to the step required Yields: Type Description StepOutput Annotated inputs with the completions. Source code in src/distilabel/steps/tasks/math_shepherd/completer.py def process(self, inputs: StepInput) -> \"StepOutput\":\n \"\"\"Does the processing of generation completions for the solutions, and annotate\n each step with the logic found in Figure 2 of the paper, with the hard estimation (Eq. (3)).\n\n Args:\n inputs: Inputs to the step\n\n Yields:\n Annotated inputs with the completions.\n \"\"\"\n\n # A list with all the inputs to be passed to the LLM. 
Needs another structure to\n # find them afterwards\n prepared_inputs = []\n # Data structure with the indices of the elements.\n # (i, j, k) where i is the input, j is the solution, and k is the completion\n input_positions = []\n golden_answers = []\n for i, input in enumerate(inputs):\n instruction = input[\"instruction\"]\n golden_solution = input[\"golden_solution\"] # This is a single solution\n golden_answers.append(golden_solution[-1])\n # This contains a list of solutions\n solutions = input[\"solutions\"]\n for j, solution in enumerate(solutions):\n # For each solution, that has K steps, we have to generate N completions\n # for the first K-2 steps (-2 because the last 2 steps are the last step, and\n # the answer itself, which can be directly compared against golden answer)\n prepared_completions = self._prepare_completions(instruction, solution)\n prepared_inputs.extend(prepared_completions)\n input_positions.extend(\n [(i, j, k) for k in range(len(prepared_completions))]\n )\n\n # Send the elements in batches to the LLM to speed up the process\n final_outputs = []\n # Added here to simplify testing in case we don't have anything to process\n # TODO: Ensure the statistics has the same shape as all the outputs, raw_outputs, and raw_inputs\n statistics = []\n total_raw_outputs = []\n total_raw_inputs = []\n for inner_batch in batched(prepared_inputs, self.input_batch_size): # type: ignore\n outputs = self.llm.generate_outputs(\n inputs=inner_batch,\n num_generations=1,\n **self.llm.get_generation_kwargs(), # type: ignore\n )\n\n formatted_outputs = []\n stats = []\n raw_outputs = []\n raw_inputs = []\n for i, output in enumerate(outputs):\n generation = output[\"generations\"][0]\n raw_inputs.append(inner_batch[i])\n raw_outputs.append(generation or \"\")\n formatted_outputs.append(self._parse_output(generation))\n stats.append(output[\"statistics\"])\n\n final_outputs.extend(formatted_outputs)\n statistics.extend(stats)\n total_raw_outputs.extend(raw_outputs)\n total_raw_inputs.extend(raw_inputs)\n\n yield self._auto_label( # type: ignore\n inputs,\n final_outputs,\n input_positions,\n golden_answers,\n statistics,\n total_raw_outputs,\n total_raw_inputs,\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter._prepare_completions","title":"_prepare_completions(instruction, steps) ","text":"Helper method to create, given a solution (a list of steps), and a instruction, the texts to be completed by the LLM. Parameters: Name Type Description Default instruction str Instruction of the problem. required steps list[str] List of steps that are part of the solution. required Returns: Type Description List[ChatType] List of ChatType, where each ChatType is the prompt corresponding to one of the steps List[ChatType] to be completed. 
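As an illustrative aside (plain Python, not part of the distilabel API; the problem and steps are made up), the prompt-building logic can be reproduced by hand: for a solution with K steps, only the first K-2 steps receive completions, since the last step and the final answer are compared directly against the golden answer: instruction = \"Janet has 3 apples and buys 2 more. How many apples does she have?\"\nsteps = [\n    \"Step 1: Janet starts with 3 apples.\",\n    \"Step 2: She buys 2 more, so 3 + 2 = 5.\",\n    \"Step 3: Therefore she has 5 apples.\",\n    \"The answer is: 5\",\n]\n# Only the first K-2 steps get completions (here K=4, so 2 partial prompts are built).\nnum_completions = len(steps[:-2])\nprompts = [instruction + \" \" + \"\\n\".join(steps[:i]) for i in range(1, num_completions + 1)]\nassert len(prompts) == 2\n 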
Source code in src/distilabel/steps/tasks/math_shepherd/completer.py def _prepare_completions(\n self, instruction: str, steps: list[str]\n) -> List[\"ChatType\"]:\n \"\"\"Helper method to create, given a solution (a list of steps), and a instruction, the\n texts to be completed by the LLM.\n\n Args:\n instruction: Instruction of the problem.\n steps: List of steps that are part of the solution.\n\n Returns:\n List of ChatType, where each ChatType is the prompt corresponding to one of the steps\n to be completed.\n \"\"\"\n prepared_inputs = []\n # Use the number of completions that correspond to a given instruction/steps pair\n # to find afterwards the input that corresponds to a given completion (to do the labelling)\n num_completions = len(steps[:-2])\n for i in range(1, num_completions + 1):\n to_complete = instruction + \" \" + \"\\n\".join(steps[:i])\n prepared_inputs.append(self.format_input({\"instruction\": to_complete}))\n\n return prepared_inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter._auto_label","title":"_auto_label(inputs, final_outputs, input_positions, golden_answers, statistics, raw_outputs, raw_inputs) ","text":"Labels the steps inplace (in the inputs), and returns the inputs. Parameters: Name Type Description Default inputs StepInput The original inputs required final_outputs list[Completions] List of generations from the LLM. It's organized as a list where the elements sent to the LLM are grouped together, then each element contains the completions, and each completion is a list of steps. required input_positions list[tuple[int, int, int]] A list with tuples generated in the process method that contains (i, j, k) where i is the index of the input, j is the index of the solution, and k is the index of the completion. required golden_answers list[str] List of golden answers for each input. required statistics list[LLMStatistics] List of statistics from the LLM. required raw_outputs list[str] List of raw outputs from the LLM. required raw_inputs list[str] List of raw inputs to the LLM. required Returns: Type Description StepInput Inputs annotated. 
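To make the hard-estimation rule concrete, here is a simplified stand-alone sketch (illustrative data, not the exact implementation shown below): a step is given the positive tag if at least one of its completions ends in the golden answer, and the negative tag otherwise: def hard_estimation_label(completions: list[list[str]], golden_answer: str, tags: tuple[str, str] = (\"+\", \"-\")) -> str:\n    # One correct completion is enough under hard estimation (Eq. (3) of the paper).\n    for completion in completions:\n        if completion and completion[-1] == golden_answer:\n            return tags[0]\n    return tags[1]\n\ncompletions = [\n    [\"Step 2: 16 - 3 - 4 = 9\", \"The answer is: 18\"],  # reaches the golden answer\n    [\"Step 2: 16 - 3 = 13\", \"The answer is: 26\"],  # does not\n]\nprint(hard_estimation_label(completions, golden_answer=\"The answer is: 18\"))  # \"+\"\n 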
Source code in src/distilabel/steps/tasks/math_shepherd/completer.py def _auto_label(\n self,\n inputs: StepInput,\n final_outputs: list[Completions],\n input_positions: list[tuple[int, int, int]],\n golden_answers: list[str],\n statistics: list[\"LLMStatistics\"],\n raw_outputs: list[str],\n raw_inputs: list[str],\n) -> StepInput:\n \"\"\"Labels the steps inplace (in the inputs), and returns the inputs.\n\n Args:\n inputs: The original inputs\n final_outputs: List of generations from the LLM.\n It's organized as a list where the elements sent to the LLM are\n grouped together, then each element contains the completions, and\n each completion is a list of steps.\n input_positions: A list with tuples generated in the process method\n that contains (i, j, k) where i is the index of the input, j is the\n index of the solution, and k is the index of the completion.\n golden_answers: List of golden answers for each input.\n statistics: List of statistics from the LLM.\n raw_outputs: List of raw outputs from the LLM.\n raw_inputs: List of raw inputs to the LLM.\n\n Returns:\n Inputs annotated.\n \"\"\"\n for i, (instruction_i, solution_i, step_i) in enumerate(input_positions):\n input = inputs[instruction_i]\n solutions = input[\"solutions\"]\n n_completions = final_outputs[i]\n label = f\" {self.tags[1]}\"\n for completion in n_completions:\n if len(completion) == 0:\n # This can be a failed generation\n label = \"\" # Everyting stays the same\n self._logger.info(\"Completer failed due to empty completion\")\n continue\n if completion[-1] == golden_answers[instruction_i]:\n label = f\" { self.tags[0]}\"\n # If we found one, it's enough as we are doing Hard Estimation\n continue\n # In case we had no solutions from the previous step, otherwise we would have\n # an IndexError\n if not solutions[solution_i]:\n continue\n solutions[solution_i][step_i] += label\n inputs[instruction_i][\"solutions\"] = solutions\n\n for i, input in enumerate(inputs):\n solutions = input[\"solutions\"]\n new_solutions = []\n for solution in solutions:\n if not solution or (len(solution) == 1):\n # The generation may fail to generate the expected\n # completions, or just added an extra empty completion,\n # we skip it.\n # Other possible error is having a list of solutions\n # with a single item, so when we call .pop, we are left\n # with an empty list, so we skip it too.\n new_solutions.append(solution)\n continue\n\n answer = solution.pop()\n label = (\n f\" {self.tags[0]}\"\n if answer == golden_answers[i]\n else f\" {self.tags[1]}\"\n )\n solution[-1] += \" \" + answer + label\n new_solutions.append(solution)\n\n # Only add the solutions if the data was properly parsed\n input[\"solutions\"] = new_solutions if new_solutions else input[\"solutions\"]\n input = self._add_metadata(\n input, statistics[i], raw_outputs[i], raw_inputs[i]\n )\n\n return inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter._add_metadata","title":"_add_metadata(input, statistics, raw_output, raw_input) ","text":"Adds the distilabel_metadata to the input. This method comes for free in the general Tasks, but as we have reimplemented the process , we have to repeat it here. Parameters: Name Type Description Default input dict[str, Any] The input to add the metadata to. required statistics list[LLMStatistics] The statistics from the LLM. required raw_output Union[str, None] The raw output from the LLM. required raw_input Union[list[dict[str, Any]], None] The raw input to the LLM. 
required Returns: Type Description dict[str, Any] The input with the metadata added if applicable. Source code in src/distilabel/steps/tasks/math_shepherd/completer.py def _add_metadata(\n self,\n input: dict[str, Any],\n statistics: list[\"LLMStatistics\"],\n raw_output: Union[str, None],\n raw_input: Union[list[dict[str, Any]], None],\n) -> dict[str, Any]:\n \"\"\"Adds the `distilabel_metadata` to the input.\n\n This method comes for free in the general Tasks, but as we have reimplemented the `process`,\n we have to repeat it here.\n\n Args:\n input: The input to add the metadata to.\n statistics: The statistics from the LLM.\n raw_output: The raw output from the LLM.\n raw_input: The raw input to the LLM.\n\n Returns:\n The input with the metadata added if applicable.\n \"\"\"\n input[\"model_name\"] = self.llm.model_name\n\n if DISTILABEL_METADATA_KEY not in input:\n input[DISTILABEL_METADATA_KEY] = {}\n # If the solutions are split afterwards, the statistics should be split\n # to avoid counting extra tokens\n input[DISTILABEL_METADATA_KEY][f\"statistics_{self.name}\"] = statistics\n\n # Leave some defaults in case something failed and we had None, otherwise when reading\n # the parquet files using pyarrow, the following error will appear:\n # ArrowInvalid: Schema\n if self.add_raw_input:\n input[DISTILABEL_METADATA_KEY][f\"raw_input_{self.name}\"] = raw_input or [\n {\"content\": \"\", \"role\": \"\"}\n ]\n if self.add_raw_output:\n input[DISTILABEL_METADATA_KEY][f\"raw_output_{self.name}\"] = raw_output or \"\"\n return input\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdCompleter.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from pydantic import BaseModel, Field\n\nclass Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\nclass MathShepherdCompleter(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\nMathShepherdCompleter.model_json_schema()\n Returns: Type Description dict[str, Any] JSON Schema of the response to enforce. 
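As a quick, illustrative check (not part of the distilabel API), a raw LLM response that follows this schema can be validated with the same pydantic models used above: from pydantic import BaseModel, Field\n\nclass Solution(BaseModel):\n    solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\nclass MathShepherdCompleter(BaseModel):\n    solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\nraw = '{\"solutions\": [{\"solution\": \"Step 1: 16 - 3 - 4 = 9 eggs are sold\"}]}'\nparsed = MathShepherdCompleter.model_validate_json(raw)  # raises ValidationError if the response is malformed\nprint(parsed.solutions[0].solution)\n 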
Source code in src/distilabel/steps/tasks/math_shepherd/completer.py @override\ndef get_structured_output(self) -> dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel, Field\n\n class Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\n class MathShepherdCompleter(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\n MathShepherdCompleter.model_json_schema()\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Solution\": {\n \"properties\": {\n \"solution\": {\n \"description\": \"Step by step solution leading to the final answer\",\n \"title\": \"Solution\",\n \"type\": \"string\",\n }\n },\n \"required\": [\"solution\"],\n \"title\": \"Solution\",\n \"type\": \"object\",\n }\n },\n \"properties\": {\n \"solutions\": {\n \"description\": \"List of solutions\",\n \"items\": {\"$ref\": \"#/$defs/Solution\"},\n \"title\": \"Solutions\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"solutions\"],\n \"title\": \"MathShepherdGenerator\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdGenerator","title":"MathShepherdGenerator ","text":" Bases: Task Math Shepherd solution generator. This task is in charge of generating completions for a given instruction, in the format expected by the Math Shepherd Completer task. The attributes make the task flexible to be used with different types of dataset and LLMs, but we provide examples for the GSM8K and MATH datasets as presented in the original paper. Before modifying them, review the current defaults to ensure the completions are generated correctly. This task can be used to generate the golden solutions for a given problem if not provided, as well as possible solutions to be then labeled by the Math Shepherd Completer. Only one of solutions or golden_solution will be generated, depending on the value of M. Attributes: Name Type Description system_prompt Optional[str] The system prompt to be used in the completions. The default one has been checked and generates good completions using Llama 3.1 with 8B and 70B, but it can be modified to adapt it to the model and dataset selected. Take into account that the system prompt includes 2 variables in the Jinja2 template, {{extra_rules}} and {{few_shot}}. These variables are used to include extra rules, for example to steer the model towards a specific type of responses, and few shots to add examples. They can be modified to adapt the system prompt to the dataset and model used without needing to change the full system prompt. extra_rules Optional[str] This field can be used to insert extra rules relevant to the type of dataset. For example, in the original paper they used GSM8K and MATH datasets, and this field can be used to insert the rules for the GSM8K dataset. few_shots Optional[str] Few shots to help the model generating the completions, write them in the format of the type of solutions wanted for your dataset. M Optional[PositiveInt] Number of completions to generate for each step. By default is set to 1, which will generate the \"golden_solution\". In this case select a stronger model, as it will be used as the source of true during labelling. 
If M is set to a number greater than 1, the task will generate a list of completions to be labeled by the Math Shepherd Completer task. Input columns - instruction (
str ): The task or instruction. Output columns - golden_solution (
str ): The step by step solution to the instruction. It will be generated if M is equal to 1. - solutions (
List[List[str]] ): A list of possible solutions to the instruction. It will be generated if M is greater than 1. - model_name (
str ): The name of the model used to generate the revision. Categories References Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations Examples: Generate the solution for a given instruction (prefer a stronger model here): from distilabel.steps.tasks import MathShepherdGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdGenerator(\n name=\"golden_solution_generator\",\n llm=llm,\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': '[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"]'}]]\n Generate M completions for a given instruction (using structured output generation): from distilabel.steps.tasks import MathShepherdGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 2048,\n },\n)\ntask = MathShepherdGenerator(\n name=\"solution_generator\",\n llm=llm,\n M=2,\n use_default_structured_output=True,\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n Source code in src/distilabel/steps/tasks/math_shepherd/generator.py class MathShepherdGenerator(Task):\n \"\"\"Math Shepherd solution generator.\n\n This task is in charge of generating completions for a given instruction, in the format expected\n by the Math Shepherd Completer task. 
The attributes make the task flexible to be used with different\n types of dataset and LLMs, but we provide examples for the GSM8K and MATH datasets as presented\n in the original paper. Before modifying them, review the current defaults to ensure the completions\n are generated correctly. This task can be used to generate the golden solutions for a given problem if\n not provided, as well as possible solutions to be then labeled by the Math Shepherd Completer.\n Only one of `solutions` or `golden_solution` will be generated, depending on the value of M.\n\n Attributes:\n system_prompt: The system prompt to be used in the completions. The default one has been\n checked and generates good completions using Llama 3.1 with 8B and 70B,\n but it can be modified to adapt it to the model and dataset selected.\n Take into account that the system prompt includes 2 variables in the Jinja2 template,\n {{extra_rules}} and {{few_shot}}. These variables are used to include extra rules, for example\n to steer the model towards a specific type of responses, and few shots to add examples.\n They can be modified to adapt the system prompt to the dataset and model used without needing\n to change the full system prompt.\n extra_rules: This field can be used to insert extra rules relevant to the type of dataset.\n For example, in the original paper they used GSM8K and MATH datasets, and this field\n can be used to insert the rules for the GSM8K dataset.\n few_shots: Few shots to help the model generating the completions, write them in the\n format of the type of solutions wanted for your dataset.\n M: Number of completions to generate for each step. By default is set to 1, which will\n generate the \"golden_solution\". In this case select a stronger model, as it will be used\n as the source of true during labelling. If M is set to a number greater than 1, the task\n will generate a list of completions to be labeled by the Math Shepherd Completer task.\n\n Input columns:\n - instruction (`str`): The task or instruction.\n\n Output columns:\n - golden_solution (`str`): The step by step solution to the instruction.\n It will be generated if M is equal to 1.\n - solutions (`List[List[str]]`): A list of possible solutions to the instruction.\n It will be generated if M is greater than 1.\n - model_name (`str`): The name of the model used to generate the revision.\n\n Categories:\n - text-generation\n\n References:\n - [`Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations`](https://arxiv.org/abs/2312.08935)\n\n Examples:\n Generate the solution for a given instruction (prefer a stronger model here):\n\n ```python\n from distilabel.steps.tasks import MathShepherdGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n )\n task = MathShepherdGenerator(\n name=\"golden_solution_generator\",\n llm=llm,\n )\n\n task.load()\n\n result = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n )\n # [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. 
She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n # 'golden_solution': '[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"]'}]]\n ```\n\n Generate M completions for a given instruction (using structured output generation):\n\n ```python\n from distilabel.steps.tasks import MathShepherdGenerator\n from distilabel.models import InferenceEndpointsLLM\n\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 2048,\n },\n )\n task = MathShepherdGenerator(\n name=\"solution_generator\",\n llm=llm,\n M=2,\n use_default_structured_output=True,\n )\n\n task.load()\n\n result = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n )\n # [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n # 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. 
+\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n ```\n \"\"\"\n\n system_prompt: Optional[str] = SYSTEM_PROMPT\n extra_rules: Optional[str] = RULES_GSM8K\n few_shots: Optional[str] = FEW_SHOTS_GSM8K\n M: Optional[PositiveInt] = None\n\n def load(self) -> None:\n super().load()\n if self.system_prompt is not None:\n self.system_prompt = Template(self.system_prompt).render(\n extra_rules=self.extra_rules or \"\",\n few_shots=self.few_shots or \"\",\n structured_prompt=SYSTEM_PROMPT_STRUCTURED\n if self.use_default_structured_output\n else \"\",\n )\n if self.use_default_structured_output:\n self._template = Template(TEMPLATE_STRUCTURED)\n else:\n self._template = Template(TEMPLATE)\n\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"instruction\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n if self.M:\n return [\"solutions\", \"model_name\"]\n return [\"golden_solution\", \"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n messages = [\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n instruction=input[\"instruction\"],\n M=self.M,\n ),\n }\n ]\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n return messages\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n output_name = \"solutions\" if self.M else \"golden_solution\"\n\n if output is None:\n input.update(**{output_name: None})\n return input\n\n if self.M:\n output_parsed = (\n self._format_structured_output(output)\n if self.use_default_structured_output\n else output.split(\"---\")\n )\n solutions = [split_solution_steps(o) for o in output_parsed]\n else:\n output_parsed = (\n self._format_structured_output(output)[0]\n if self.use_default_structured_output\n else output\n )\n solutions = split_solution_steps(output_parsed)\n\n input.update(**{output_name: solutions})\n return input\n\n @override\n def get_structured_output(self) -> dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel, Field\n\n class Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\n class MathShepherdGenerator(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\n MathShepherdGenerator.model_json_schema()\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Solution\": {\n \"properties\": {\n \"solution\": {\n \"description\": \"Step by step solution leading to the final answer\",\n \"title\": \"Solution\",\n \"type\": \"string\",\n }\n },\n \"required\": [\"solution\"],\n \"title\": \"Solution\",\n \"type\": \"object\",\n }\n },\n \"properties\": {\n \"solutions\": {\n \"description\": \"List of solutions\",\n \"items\": {\"$ref\": \"#/$defs/Solution\"},\n \"title\": \"Solutions\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"solutions\"],\n \"title\": \"MathShepherdGenerator\",\n \"type\": \"object\",\n }\n\n def _format_structured_output(self, output: str) -> list[str]:\n default_output = [\"\"] * self.M if self.M else [\"\"]\n if parsed_output := parse_json_response(output):\n solutions = parsed_output[\"solutions\"]\n extracted_solutions = 
[o[\"solution\"] for o in solutions]\n if len(extracted_solutions) != self.M:\n extracted_solutions = default_output\n return extracted_solutions\n return default_output\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MathShepherdGenerator.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from pydantic import BaseModel, Field\n\nclass Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\nclass MathShepherdGenerator(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\nMathShepherdGenerator.model_json_schema()\n Returns: Type Description dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/math_shepherd/generator.py @override\ndef get_structured_output(self) -> dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel, Field\n\n class Solution(BaseModel):\n solution: str = Field(..., description=\"Step by step solution leading to the final answer\")\n\n class MathShepherdGenerator(BaseModel):\n solutions: list[Solution] = Field(..., description=\"List of solutions\")\n\n MathShepherdGenerator.model_json_schema()\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"$defs\": {\n \"Solution\": {\n \"properties\": {\n \"solution\": {\n \"description\": \"Step by step solution leading to the final answer\",\n \"title\": \"Solution\",\n \"type\": \"string\",\n }\n },\n \"required\": [\"solution\"],\n \"title\": \"Solution\",\n \"type\": \"object\",\n }\n },\n \"properties\": {\n \"solutions\": {\n \"description\": \"List of solutions\",\n \"items\": {\"$ref\": \"#/$defs/Solution\"},\n \"title\": \"Solutions\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"solutions\"],\n \"title\": \"MathShepherdGenerator\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.FormatPRM","title":"FormatPRM ","text":" Bases: Step Helper step to transform the data into the format expected by the PRM model. This step can be used to format the data in one of 2 formats: Following the format presented in peiyi9979/Math-Shepherd, in which case this step creates the columns input and label, where the input is the instruction with the solution (and the tag replaced by a token), and the label is the instruction with the solution, both separated by a newline. Following TRL's format for training, which generates the columns prompt, completions, and labels. The labels correspond to the original tags replaced by boolean values, where True represents correct steps. Attributes: Name Type Description format Literal['math-shepherd', 'trl'] The format to use for the PRM model. \"math-shepherd\" corresponds to the original paper, while \"trl\" is a format prepared to train the model using TRL. step_token str String that serves as a unique token denoting the position for predicting the step score. tags list[str] List of tags that represent the correct and incorrect steps. This only needs to be informed if it's different than the default in MathShepherdCompleter . 
Input columns - instruction (
str ): The task or instruction. - solutions (
list[str] ): List of steps with a solution to the task. Output columns - input (
str ): The instruction with the solutions, where the label tags are replaced by a token. - label (
str ): The instruction with the solutions. - prompt (
str ): The instruction with the solutions, where the label tags are replaced by a token. - completions (
List[str] ): The solution represented as a list of steps. - labels (
List[bool] ): The labels, as a list of booleans, where True represents a good response. Categories - text-manipulation
- columns
References Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations - peiyi9979/Math-Shepherd
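Before the fuller examples below, a minimal stand-alone sketch (made-up data, mirroring the _format_math_shepherd helper shown in the source further down) of the math-shepherd format, where the trailing +/- tag of each step is replaced by the step token to build input, while label keeps the original tags: step_token = \"\u043a\u0438\"\ninstruction = \"Janet has 3 apples and buys 2 more. How many apples does she have?\"\nsolution = [\n    \"Step 1: Janet starts with 3 apples. +\",\n    \"Step 2: 3 + 2 = 5, so she has 5 apples. The answer is: 5 +\",\n]\nreplaced = [step[:-1] + step_token for step in solution]\nformatted = {\n    \"input\": instruction + \" \" + \"\\n\".join(replaced),\n    \"label\": instruction + \" \" + \"\\n\".join(solution),\n}\nprint(formatted[\"input\"])\n 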
Examples: Prepare your data to train a PRM model with the Math-Shepherd format: from distilabel.steps.tasks import FormatPRM\nfrom distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(columns=[\"solutions\"])\nexpand_columns.load()\n\n# Define our PRM formatter\nformatter = FormatPRM()\nformatter.load()\n\n# Expand the solutions column as it comes from the MathShepherdCompleter\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"]]\n },\n ]\n )\n)\nresult = next(formatter.process(result))\n Prepare your data to train a PRM model with the TRL format: from distilabel.steps.tasks import FormatPRM\nfrom distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(columns=[\"solutions\"])\nexpand_columns.load()\n\n# Define our PRM formatter\nformatter = FormatPRM(format=\"trl\")\nformatter.load()\n\n# Expand the solutions column as it comes from the MathShepherdCompleter\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"]]\n },\n ]\n )\n)\n\nresult = next(formatter.process(result))\n# {\n# \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# \"solutions\": [\n# \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\",\n# \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\",\n# \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"\n# ],\n# \"prompt\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?\",\n# \"completions\": [\n# \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required.\",\n# \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber.\",\n# \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3\"\n# ],\n# \"labels\": [\n# true,\n# true,\n# true\n# ]\n# }\n Citations: ```\n@misc{wang2024mathshepherdverifyreinforcellms,\n title={Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations},\n author={Peiyi Wang and Lei Li and Zhihong Shao and R. X. Xu and Damai Dai and Yifei Li and Deli Chen and Y. Wu and Zhifang Sui},\n year={2024},\n eprint={2312.08935},\n archivePrefix={arXiv},\n primaryClass={cs.AI},\n url={https://arxiv.org/abs/2312.08935},\n}\n```\n Source code in src/distilabel/steps/tasks/math_shepherd/utils.py class FormatPRM(Step):\n \"\"\"Helper step to transform the data into the format expected by the PRM model.\n\n This step can be used to format the data in one of 2 formats:\n Following the format presented\n in [peiyi9979/Math-Shepherd](https://huggingface.co/datasets/peiyi9979/Math-Shepherd?row=0),\n in which case this step creates the columns input and label, where the input is the instruction\n with the solution (and the tag replaced by a token), and the label is the instruction\n with the solution, both separated by a newline.\n Following TRL's format for training, which generates the columns prompt, completions, and labels.\n The labels correspond to the original tags replaced by boolean values, where True represents\n correct steps.\n\n Attributes:\n format: The format to use for the PRM model.\n \"math-shepherd\" corresponds to the original paper, while \"trl\" is a format\n prepared to train the model using TRL.\n step_token: String that serves as a unique token denoting the position\n for predicting the step score.\n tags: List of tags that represent the correct and incorrect steps.\n This only needs to be informed if it's different than the default in\n `MathShepherdCompleter`.\n\n Input columns:\n - instruction (`str`): The task or instruction.\n - solutions (`list[str]`): List of steps with a solution to the task.\n\n Output columns:\n - input (`str`): The instruction with the solutions, where the label tags\n are replaced by a token.\n - label (`str`): The instruction with the solutions.\n - prompt (`str`): The instruction with the solutions, where the label tags\n are replaced by a token.\n - completions (`List[str]`): The solution represented as a list of steps.\n - labels (`List[bool]`): The labels, as a list of booleans, where True represents\n a good response.\n\n Categories:\n - text-manipulation\n - columns\n\n References:\n - [`Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations`](https://arxiv.org/abs/2312.08935)\n - [peiyi9979/Math-Shepherd](https://huggingface.co/datasets/peiyi9979/Math-Shepherd?row=0)\n\n Examples:\n Prepare your data to train a PRM model with the Math-Shepherd format:\n\n ```python\n from distilabel.steps.tasks import FormatPRM\n from distilabel.steps import ExpandColumns\n\n expand_columns = ExpandColumns(columns=[\"solutions\"])\n expand_columns.load()\n\n # Define our PRM formatter\n formatter = FormatPRM()\n formatter.load()\n\n # Expand the solutions column as it comes from the MathShepherdCompleter\n result = 
next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"]]\n },\n ]\n )\n )\n result = next(formatter.process(result))\n ```\n\n Prepare your data to train a PRM model with the TRL format:\n\n ```python\n from distilabel.steps.tasks import FormatPRM\n from distilabel.steps import ExpandColumns\n\n expand_columns = ExpandColumns(columns=[\"solutions\"])\n expand_columns.load()\n\n # Define our PRM formatter\n formatter = FormatPRM(format=\"trl\")\n formatter.load()\n\n # Expand the solutions column as it comes from the MathShepherdCompleter\n result = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. 
+\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it\\'s half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"]]\n },\n ]\n )\n )\n\n result = next(formatter.process(result))\n # {\n # \"instruction\": \"Janet\\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n # \"solutions\": [\n # \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\",\n # \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\",\n # \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"\n # ],\n # \"prompt\": \"Janet\\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n # \"completions\": [\n # \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required.\",\n # \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber.\",\n # \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. 
The answer is: 3\"\n # ],\n # \"labels\": [\n # true,\n # true,\n # true\n # ]\n # }\n ```\n\n Citations:\n\n ```\n @misc{wang2024mathshepherdverifyreinforcellms,\n title={Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations},\n author={Peiyi Wang and Lei Li and Zhihong Shao and R. X. Xu and Damai Dai and Yifei Li and Deli Chen and Y. Wu and Zhifang Sui},\n year={2024},\n eprint={2312.08935},\n archivePrefix={arXiv},\n primaryClass={cs.AI},\n url={https://arxiv.org/abs/2312.08935},\n }\n ```\n \"\"\"\n\n format: Literal[\"math-shepherd\", \"trl\"] = \"math-shepherd\"\n step_token: str = \"\u043a\u0438\"\n tags: list[str] = [\"+\", \"-\"]\n\n def model_post_init(self, __context: Any) -> None:\n super().model_post_init(__context)\n if self.format == \"math-shepherd\":\n self._formatter = self._format_math_shepherd\n else:\n self._formatter = self._format_trl\n\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"instruction\", \"solutions\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n if self.format == \"math-shepherd\":\n return [\"input\", \"label\"]\n return [\"prompt\", \"completions\", \"labels\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The process prepares the data for the `APIGenGenerator` task.\n\n If a single example is provided, it is copied to avoid raising an error.\n\n Args:\n inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n self._formatter(input)\n\n yield inputs # type: ignore\n\n def _format_math_shepherd(\n self, input: dict[str, str]\n ) -> dict[str, Union[str, list[str]]]:\n instruction = input[\"instruction\"]\n replaced = []\n # At this stage, the \"solutions\" column can only contain a single solution,\n # and the last item of each solution is the tag.\n solution = input[\"solutions\"]\n for step in solution:\n # Check there's a string, because the step that generated\n # the solutions could have failed, and we would have an empty list.\n replaced.append(step[:-1] + self.step_token if len(step) > 1 else step)\n\n input[\"input\"] = instruction + \" \" + \"\\n\".join(replaced)\n input[\"label\"] = instruction + \" \" + \"\\n\".join(solution)\n\n return input # type: ignore\n\n def _format_trl(\n self, input: dict[str, str]\n ) -> dict[str, Union[str, list[str], list[bool]]]:\n input[\"prompt\"] = input[\"instruction\"]\n completions: list[str] = []\n labels: list[bool] = []\n for step in input[\"solutions\"]:\n token = step[-1]\n completions.append(step[:-1].strip())\n labels.append(True if token == self.tags[0] else False)\n\n input[\"completions\"] = completions # type: ignore\n input[\"labels\"] = labels # type: ignore\n\n return input # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.FormatPRM.process","title":"process(inputs) ","text":"The process prepares the data for the APIGenGenerator task. If a single example is provided, it is copied to avoid raising an error. Parameters: Name Type Description Default inputs StepInput A list of dictionaries with the input data. required Yields: Type Description StepOutput A list of dictionaries with the output data. 
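As a stand-alone illustration of the trl format (made-up data, mirroring the _format_trl helper shown in the class source above): each step loses its trailing tag and becomes a completion, and the tag is mapped to a boolean label, True for the positive tag: tags = [\"+\", \"-\"]\nrow = {\n    \"instruction\": \"Janet has 3 apples and buys 2 more. How many apples does she have?\",\n    \"solutions\": [\"Step 1: Janet starts with 3 apples. +\", \"Step 2: 3 + 2 = 5. The answer is: 5 +\"],\n}\nrow[\"prompt\"] = row[\"instruction\"]\nrow[\"completions\"] = [step[:-1].strip() for step in row[\"solutions\"]]\nrow[\"labels\"] = [step[-1] == tags[0] for step in row[\"solutions\"]]\nprint(row[\"labels\"])  # [True, True]\n 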
Source code in src/distilabel/steps/tasks/math_shepherd/utils.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"The process prepares the data in the format expected by the PRM model.\n\n If a single example is provided, it is copied to avoid raising an error.\n\n Args:\n inputs: A list of dictionaries with the input data.\n\n Yields:\n A list of dictionaries with the output data.\n \"\"\"\n for input in inputs:\n self._formatter(input)\n\n yield inputs # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM","title":"PairRM ","text":" Bases: Step Rank the candidates based on the input using the LLM model. Attributes: Name Type Description model str The model to use for the ranking. Defaults to \"llm-blender/PairRM\" . instructions Optional[str] The instructions to use for the model. Defaults to None . Input columns - inputs (
List[Dict[str, Any]] ): The input text or conversation to rank the candidates for. - candidates (
List[Dict[str, Any]] ): The candidates to rank. Output columns - ranks (
List[int] ): The ranks of the candidates based on the input. - ranked_candidates (
List[Dict[str, Any]] ): The candidates ranked based on the input. - model_name (
str ): The model name used to rank the candidate responses. Defaults to \"llm-blender/PairRM\" . References - LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion.
- Pair Ranking Model.
Categories Note This step differs to other tasks as there is a single implementation of this model currently, and we will use a specific LLM . Examples: Rank LLM candidates: from distilabel.steps.tasks import PairRM\n\n# Consider this as a placeholder for your actual LLM.\npair_rm = PairRM()\n\npair_rm.load()\n\nresult = next(\n scorer.process(\n [\n {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n ]\n )\n)\n# result\n# [\n# {\n# 'input': 'Hello, how are you?',\n# 'candidates': ['fine', 'good', 'bad'],\n# 'ranks': [2, 1, 3],\n# 'ranked_candidates': ['good', 'fine', 'bad'],\n# 'model_name': 'llm-blender/PairRM',\n# }\n# ]\n Citations @misc{jiang2023llmblenderensemblinglargelanguage,\n title={LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion},\n author={Dongfu Jiang and Xiang Ren and Bill Yuchen Lin},\n year={2023},\n eprint={2306.02561},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2306.02561},\n}\n Source code in src/distilabel/steps/tasks/pair_rm.py class PairRM(Step):\n \"\"\"Rank the candidates based on the input using the `LLM` model.\n\n Attributes:\n model: The model to use for the ranking. Defaults to `\"llm-blender/PairRM\"`.\n instructions: The instructions to use for the model. Defaults to `None`.\n\n Input columns:\n - inputs (`List[Dict[str, Any]]`): The input text or conversation to rank the candidates for.\n - candidates (`List[Dict[str, Any]]`): The candidates to rank.\n\n Output columns:\n - ranks (`List[int]`): The ranks of the candidates based on the input.\n - ranked_candidates (`List[Dict[str, Any]]`): The candidates ranked based on the input.\n - model_name (`str`): The model name used to rank the candidate responses. Defaults to `\"llm-blender/PairRM\"`.\n\n References:\n - [LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion](https://arxiv.org/abs/2306.02561).\n - [Pair Ranking Model](https://huggingface.co/llm-blender/PairRM).\n\n Categories:\n - preference\n\n Note:\n This step differs to other tasks as there is a single implementation of this model\n currently, and we will use a specific `LLM`.\n\n Examples:\n Rank LLM candidates:\n\n ```python\n from distilabel.steps.tasks import PairRM\n\n # Consider this as a placeholder for your actual LLM.\n pair_rm = PairRM()\n\n pair_rm.load()\n\n result = next(\n scorer.process(\n [\n {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n ]\n )\n )\n # result\n # [\n # {\n # 'input': 'Hello, how are you?',\n # 'candidates': ['fine', 'good', 'bad'],\n # 'ranks': [2, 1, 3],\n # 'ranked_candidates': ['good', 'fine', 'bad'],\n # 'model_name': 'llm-blender/PairRM',\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{jiang2023llmblenderensemblinglargelanguage,\n title={LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion},\n author={Dongfu Jiang and Xiang Ren and Bill Yuchen Lin},\n year={2023},\n eprint={2306.02561},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2306.02561},\n }\n ```\n \"\"\"\n\n model: str = \"llm-blender/PairRM\"\n instructions: Optional[str] = None\n\n def load(self) -> None:\n \"\"\"Loads the PairRM model provided via `model` with `llm_blender.Blender`, which is the\n custom library for running the inference for the PairRM models.\"\"\"\n try:\n import llm_blender\n except ImportError as e:\n raise ImportError(\n \"The `llm_blender` package is required to use the 
`PairRM` class.\"\n \"Please install it with `pip install git+https://github.com/yuchenlin/LLM-Blender.git`.\"\n ) from e\n\n self._blender = llm_blender.Blender()\n self._blender.loadranker(self.model)\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The input columns correspond to the two required arguments from `Blender.rank`:\n `inputs` and `candidates`.\"\"\"\n return [\"input\", \"candidates\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n \"\"\"The outputs will include the `ranks` and the `ranked_candidates`.\"\"\"\n return [\"ranks\", \"ranked_candidates\", \"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"The input is expected to be a dictionary with the keys `input` and `candidates`,\n where the `input` corresponds to the instruction of a model and `candidates` are a\n list of responses to be ranked.\n \"\"\"\n return {\"input\": input[\"input\"], \"candidates\": input[\"candidates\"]}\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generates the ranks for the candidates based on the input.\n\n The ranks are the positions of the candidates, where lower is better,\n and the ranked candidates correspond to the candidates sorted according to the\n ranks obtained.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n An iterator with the inputs containing the `ranks`, `ranked_candidates`, and `model_name`.\n \"\"\"\n input_texts = []\n candidates = []\n for input in inputs:\n formatted_input = self.format_input(input)\n input_texts.append(formatted_input[\"input\"])\n candidates.append(formatted_input[\"candidates\"])\n\n instructions = (\n [self.instructions] * len(input_texts) if self.instructions else None\n )\n\n ranks = self._blender.rank(\n input_texts,\n candidates,\n instructions=instructions,\n return_scores=False,\n batch_size=self.input_batch_size,\n )\n # Sort the candidates based on the ranks\n ranked_candidates = np.take_along_axis(\n np.array(candidates), ranks - 1, axis=1\n ).tolist()\n ranks = ranks.tolist()\n for input, rank, ranked_candidate in zip(inputs, ranks, ranked_candidates):\n input[\"ranks\"] = rank\n input[\"ranked_candidates\"] = ranked_candidate\n input[\"model_name\"] = self.model\n\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.inputs","title":"inputs: StepColumns property ","text":"The input columns correspond to the two required arguments from Blender.rank : inputs and candidates . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.outputs","title":"outputs: StepColumns property ","text":"The outputs will include the ranks and the ranked_candidates . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.load","title":"load() ","text":"Loads the PairRM model provided via model with llm_blender.Blender , which is the custom library for running the inference for the PairRM models. 
Source code in src/distilabel/steps/tasks/pair_rm.py def load(self) -> None:\n \"\"\"Loads the PairRM model provided via `model` with `llm_blender.Blender`, which is the\n custom library for running the inference for the PairRM models.\"\"\"\n try:\n import llm_blender\n except ImportError as e:\n raise ImportError(\n \"The `llm_blender` package is required to use the `PairRM` class.\"\n \"Please install it with `pip install git+https://github.com/yuchenlin/LLM-Blender.git`.\"\n ) from e\n\n self._blender = llm_blender.Blender()\n self._blender.loadranker(self.model)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.format_input","title":"format_input(input) ","text":"The input is expected to be a dictionary with the keys input and candidates , where the input corresponds to the instruction of a model and candidates are a list of responses to be ranked. Source code in src/distilabel/steps/tasks/pair_rm.py def format_input(self, input: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"The input is expected to be a dictionary with the keys `input` and `candidates`,\n where the `input` corresponds to the instruction of a model and `candidates` are a\n list of responses to be ranked.\n \"\"\"\n return {\"input\": input[\"input\"], \"candidates\": input[\"candidates\"]}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.process","title":"process(inputs) ","text":"Generates the ranks for the candidates based on the input. The ranks are the positions of the candidates, where lower is better, and the ranked candidates correspond to the candidates sorted according to the ranks obtained. Parameters: Name Type Description Default inputs StepInput A list of Python dictionaries with the inputs of the task. required Yields: Type Description StepOutput An iterator with the inputs containing the ranks , ranked_candidates , and model_name . Source code in src/distilabel/steps/tasks/pair_rm.py def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n \"\"\"Generates the ranks for the candidates based on the input.\n\n The ranks are the positions of the candidates, where lower is better,\n and the ranked candidates correspond to the candidates sorted according to the\n ranks obtained.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n An iterator with the inputs containing the `ranks`, `ranked_candidates`, and `model_name`.\n \"\"\"\n input_texts = []\n candidates = []\n for input in inputs:\n formatted_input = self.format_input(input)\n input_texts.append(formatted_input[\"input\"])\n candidates.append(formatted_input[\"candidates\"])\n\n instructions = (\n [self.instructions] * len(input_texts) if self.instructions else None\n )\n\n ranks = self._blender.rank(\n input_texts,\n candidates,\n instructions=instructions,\n return_scores=False,\n batch_size=self.input_batch_size,\n )\n # Sort the candidates based on the ranks\n ranked_candidates = np.take_along_axis(\n np.array(candidates), ranks - 1, axis=1\n ).tolist()\n ranks = ranks.tolist()\n for input, rank, ranked_candidate in zip(inputs, ranks, ranked_candidates):\n input[\"ranks\"] = rank\n input[\"ranked_candidates\"] = ranked_candidate\n input[\"model_name\"] = self.model\n\n yield inputs\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval","title":"PrometheusEval ","text":" Bases: Task Critique and rank the quality of generations from an LLM using Prometheus 2.0. 
PrometheusEval is a task created for Prometheus 2.0, covering both the absolute and relative evaluations. The absolute evaluation i.e. mode=\"absolute\" is used to evaluate a single generation from an LLM for a given instruction. The relative evaluation i.e. mode=\"relative\" is used to evaluate two generations from an LLM for a given instruction. Both evaluations provide the possibility of using a reference answer to compare with or without the reference attribute, and both are based on a score rubric that critiques the generation/s based on the following default aspects: helpfulness , harmlessness , honesty , factual-validity , and reasoning . These can be overridden via rubrics , and the selected rubric is set via the attribute rubric . Note The PrometheusEval task is better suited and intended to be used with any of the Prometheus 2.0 models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0, and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting and quality are not guaranteed if using another model, even though some other models may be able to correctly follow the formatting and generate insightful critiques too. Attributes: Name Type Description mode Literal['absolute', 'relative'] the evaluation mode to use, either absolute or relative . It defines whether the task will evaluate one or two generations. rubric str the score rubric to use within the prompt to run the critique based on different aspects. Can be any existing key in the rubrics attribute, which by default means that it can be: helpfulness , harmlessness , honesty , factual-validity , or reasoning . Those will only work if using the default rubrics , otherwise, the provided rubrics should be used. rubrics Optional[Dict[str, str]] a dictionary containing the different rubrics to use for the critique, where the keys are the rubric names and the values are the rubric descriptions. The default rubrics are the following: helpfulness , harmlessness , honesty , factual-validity , and reasoning . reference bool a boolean flag to indicate whether a reference answer / completion will be provided, so that the model critique is based on the comparison with it. It implies that the column reference needs to be provided within the input data in addition to the rest of the inputs. _template Union[Template, None] a Jinja2 template used to format the input for the LLM. Input columns - instruction (
str ): The instruction to use as reference. - generation (
str , optional): The generated text from the given instruction . This column is required if mode=absolute . - generations (
List[str] , optional): The generated texts from the given instruction . It should contain 2 generations only. This column is required if mode=relative . - reference (
str , optional): The reference / golden answer for the instruction , for the LLM to compare the generation against. Output columns - feedback (
str ): The feedback explaining the result below, as critiqued by the LLM using the pre-defined score rubric, compared against reference if provided. - result (
Union[int, Literal[\"A\", \"B\"]] ): If mode=absolute , then the result contains the score for the generation in a likert-scale from 1-5, otherwise, if mode=relative , then the result contains either \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of generations if result='A' or the index 1 if result='B' . - model_name (
str ): The model name used to generate the feedback and result . Categories References - Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models
- prometheus-eval: Evaluate your LLM's response with Prometheus \ud83d\udcaf
Examples: Critique and evaluate LLM generation quality using Prometheus 2_0: from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"factual-validity\"\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generation\": \"something done\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n Critique for relative evaluation: from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"relative\",\n rubric=\"honesty\"\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generations': ['something done', 'other thing'],\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 'something done',\n# }\n# ]\n Critique with a custom rubric: from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"custom\",\n rubrics={\n \"custom\": \"[A]\\nScore 1: A\\nScore 2: B\\nScore 3: C\\nScore 4: D\\nScore 5: E\"\n }\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generation\": \"something done\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n Critique using a reference answer: from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"helpfulness\",\n reference=True,\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\n \"instruction\": \"make something\",\n \"generation\": \"something done\",\n \"reference\": \"this is a reference answer\",\n },\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'reference': 'this is a reference answer',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n Citations @misc{kim2024prometheus2opensource,\n title={Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language 
Models},\n author={Seungone Kim and Juyoung Suk and Shayne Longpre and Bill Yuchen Lin and Jamin Shin and Sean Welleck and Graham Neubig and Moontae Lee and Kyungjae Lee and Minjoon Seo},\n year={2024},\n eprint={2405.01535},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2405.01535},\n}\n Source code in src/distilabel/steps/tasks/prometheus_eval.py class PrometheusEval(Task):\n \"\"\"Critique and rank the quality of generations from an `LLM` using Prometheus 2.0.\n\n `PrometheusEval` is a task created for Prometheus 2.0, covering both the absolute and relative\n evaluations. The absolute evaluation i.e. `mode=\"absolute\"` is used to evaluate a single generation from\n an LLM for a given instruction. The relative evaluation i.e. `mode=\"relative\"` is used to evaluate two generations from an LLM\n for a given instruction.\n Both evaluations provide the possibility of using a reference answer to compare with or withoug\n the `reference` attribute, and both are based on a score rubric that critiques the generation/s\n based on the following default aspects: `helpfulness`, `harmlessness`, `honesty`, `factual-validity`,\n and `reasoning`, that can be overridden via `rubrics`, and the selected rubric is set via the attribute\n `rubric`.\n\n Note:\n The `PrometheusEval` task is better suited and intended to be used with any of the Prometheus 2.0\n models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0,\n and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting\n and quality is not guaranteed if using another model, even though some other models may be able to\n correctly follow the formatting and generate insightful critiques too.\n\n Attributes:\n mode: the evaluation mode to use, either `absolute` or `relative`. It defines whether the task\n will evaluate one or two generations.\n rubric: the score rubric to use within the prompt to run the critique based on different aspects.\n Can be any existing key in the `rubrics` attribute, which by default means that it can be:\n `helpfulness`, `harmlessness`, `honesty`, `factual-validity`, or `reasoning`. Those will only\n work if using the default `rubrics`, otherwise, the provided `rubrics` should be used.\n rubrics: a dictionary containing the different rubrics to use for the critique, where the keys are\n the rubric names and the values are the rubric descriptions. The default rubrics are the following:\n `helpfulness`, `harmlessness`, `honesty`, `factual-validity`, and `reasoning`.\n reference: a boolean flag to indicate whether a reference answer / completion will be provided, so\n that the model critique is based on the comparison with it. It implies that the column `reference`\n needs to be provided within the input data in addition to the rest of the inputs.\n _template: a Jinja2 template used to format the input for the LLM.\n\n Input columns:\n - instruction (`str`): The instruction to use as reference.\n - generation (`str`, optional): The generated text from the given `instruction`. This column is required\n if `mode=absolute`.\n - generations (`List[str]`, optional): The generated texts from the given `instruction`. It should\n contain 2 generations only. 
This column is required if `mode=relative`.\n - reference (`str`, optional): The reference / golden answer for the `instruction`, to be used by the LLM\n for comparison against.\n\n Output columns:\n - feedback (`str`): The feedback explaining the result below, as critiqued by the LLM using the\n pre-defined score rubric, compared against `reference` if provided.\n - result (`Union[int, Literal[\"A\", \"B\"]]`): If `mode=absolute`, then the result contains the score for the\n `generation` in a likert-scale from 1-5, otherwise, if `mode=relative`, then the result contains either\n \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of `generations` if `result='A'` or the\n index 1 if `result='B'`.\n - model_name (`str`): The model name used to generate the `feedback` and `result`.\n\n Categories:\n - critique\n - preference\n\n References:\n - [Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models](https://arxiv.org/abs/2405.01535)\n - [prometheus-eval: Evaluate your LLM's response with Prometheus \ud83d\udcaf](https://github.com/prometheus-eval/prometheus-eval)\n\n Examples:\n Critique and evaluate LLM generation quality using Prometheus 2_0:\n\n ```python\n from distilabel.steps.tasks import PrometheusEval\n from distilabel.models import vLLM\n\n # Consider this as a placeholder for your actual LLM.\n prometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"factual-validity\"\n )\n\n prometheus.load()\n\n result = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generation\": \"something done\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'make something',\n # 'generation': 'something done',\n # 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n # 'feedback': 'the feedback',\n # 'result': 6,\n # }\n # ]\n ```\n\n Critique for relative evaluation:\n\n ```python\n from distilabel.steps.tasks import PrometheusEval\n from distilabel.models import vLLM\n\n # Consider this as a placeholder for your actual LLM.\n prometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n ),\n mode=\"relative\",\n rubric=\"honesty\"\n )\n\n prometheus.load()\n\n result = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'make something',\n # 'generations': ['something done', 'other thing'],\n # 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n # 'feedback': 'the feedback',\n # 'result': 'something done',\n # }\n # ]\n ```\n\n Critique with a custom rubric:\n\n ```python\n from distilabel.steps.tasks import PrometheusEval\n from distilabel.models import vLLM\n\n # Consider this as a placeholder for your actual LLM.\n prometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"custom\",\n rubrics={\n \"custom\": \"[A]\\\\nScore 1: A\\\\nScore 2: B\\\\nScore 3: C\\\\nScore 4: D\\\\nScore 5: E\"\n }\n )\n\n prometheus.load()\n\n result = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", 
\"generation\": \"something done\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'make something',\n # 'generation': 'something done',\n # 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n # 'feedback': 'the feedback',\n # 'result': 6,\n # }\n # ]\n ```\n\n Critique using a reference answer:\n\n ```python\n from distilabel.steps.tasks import PrometheusEval\n from distilabel.models import vLLM\n\n # Consider this as a placeholder for your actual LLM.\n prometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"helpfulness\",\n reference=True,\n )\n\n prometheus.load()\n\n result = next(\n prometheus.process(\n [\n {\n \"instruction\": \"make something\",\n \"generation\": \"something done\",\n \"reference\": \"this is a reference answer\",\n },\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'make something',\n # 'generation': 'something done',\n # 'reference': 'this is a reference answer',\n # 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n # 'feedback': 'the feedback',\n # 'result': 6,\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{kim2024prometheus2opensource,\n title={Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models},\n author={Seungone Kim and Juyoung Suk and Shayne Longpre and Bill Yuchen Lin and Jamin Shin and Sean Welleck and Graham Neubig and Moontae Lee and Kyungjae Lee and Minjoon Seo},\n year={2024},\n eprint={2405.01535},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2405.01535},\n }\n ```\n \"\"\"\n\n mode: Literal[\"absolute\", \"relative\"]\n rubric: str\n rubrics: Optional[Dict[str, str]] = Field(default=_DEFAULT_RUBRICS)\n reference: bool = False\n\n _template: Union[Template, None] = PrivateAttr(...)\n\n @model_validator(mode=\"after\")\n def validate_rubric_and_rubrics(self) -> Self:\n if not isinstance(self.rubrics, dict) or len(self.rubrics) < 1:\n raise DistilabelUserError(\n \"Provided `rubrics` must be a Python dictionary with string keys and string values.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n def rubric_matches_pattern(rubric: str) -> bool:\n \"\"\"Checks if the provided rubric matches the pattern of the default rubrics.\"\"\"\n pattern = r\"^\\[.*?\\]\\n(?:Score [1-4]: .*?\\n){4}(?:Score 5: .*?)\"\n return bool(re.match(pattern, rubric, re.MULTILINE))\n\n if not all(rubric_matches_pattern(value) for value in self.rubrics.values()):\n raise DistilabelUserError(\n \"Provided rubrics should match the format of the default rubrics, which\"\n \" is as follows: `[<scoring criteria>]\\nScore 1: <description>\\nScore 2: <description>\\n\"\n \"Score 3: <description>\\nScore 4: <description>\\nScore 5: <description>`; replacing\"\n \" `<scoring criteria>` and `<description>` with the actual criteria and description\"\n \" for each or the scores, respectively.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n if self.rubric not in self.rubrics:\n raise DistilabelUserError(\n f\"Provided rubric '{self.rubric}' is not among the available rubrics: {', '.join(self.rubrics.keys())}.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n return self\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation\n depending on the `mode` value, and either with or without reference, depending on the\n value of 
`reference`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"prometheus\"\n / (\n f\"{self.mode}_without_reference.jinja2\"\n if self.reference is False\n else f\"{self.mode}_with_reference.jinja2\"\n )\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The default inputs for the task are the `instruction` and the `generation`\n if `reference=False`, otherwise, the inputs are `instruction`, `generation`, and\n `reference`.\"\"\"\n if self.mode == \"absolute\":\n if self.reference:\n return [\"instruction\", \"generation\", \"reference\"]\n return [\"instruction\", \"generation\"]\n else:\n if self.reference:\n return [\"instruction\", \"generations\", \"reference\"]\n return [\"instruction\", \"generations\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` where the prompt is formatted according\n to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction\n from the user, including a pre-defined system prompt.\"\"\"\n template_kwargs = {\n \"instruction\": input[\"instruction\"],\n \"rubric\": self.rubrics[self.rubric],\n }\n if self.reference:\n template_kwargs[\"reference\"] = input[\"reference\"]\n\n if self.mode == \"absolute\":\n if not isinstance(input[\"generation\"], str):\n raise DistilabelUserError(\n f\"Provided `generation` is of type {type(input['generation'])} but a string\"\n \" should be provided instead.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n template_kwargs[\"generation\"] = input[\"generation\"]\n system_message = (\n \"You are a fair judge assistant tasked with providing clear, objective feedback based\"\n \" on specific criteria, ensuring each assessment reflects the absolute standards set\"\n \" for performance.\"\n )\n else: # self.mode == \"relative\"\n if (\n not isinstance(input[\"generations\"], list)\n or not all(\n isinstance(generation, str) for generation in input[\"generations\"]\n )\n or len(input[\"generations\"]) != 2\n ):\n raise DistilabelUserError(\n f\"Provided `generations` is of type {type(input['generations'])} but a list of strings with length 2 should be provided instead.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n template_kwargs[\"generations\"] = input[\"generations\"]\n system_message = (\n \"You are a fair judge assistant assigned to deliver insightful feedback that compares\"\n \" individual performances, highlighting how each stands relative to others within the\"\n \" same cohort.\"\n )\n\n return [\n {\n \"role\": \"system\",\n \"content\": system_message,\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render(**template_kwargs), # type: ignore\n },\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task are the `feedback` and the `result` generated by Prometheus,\n as well as the `model_name` which is automatically included based on the `LLM` used.\n \"\"\"\n return [\"feedback\", \"result\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dict with the keys `feedback` and `result` captured\n using a regex from the Prometheus output.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Optionally provided in case it's useful to build the output.\n\n Returns:\n A dict with the keys `feedback` and `result` generated by the LLM.\n \"\"\"\n if output is None:\n return {\"feedback\": None, \"result\": None}\n\n parts = output.split(\"[RESULT]\")\n if len(parts) != 2:\n return {\"feedback\": None, \"result\": None}\n\n feedback, result = parts[0].strip(), parts[1].strip()\n if feedback.startswith(\"Feedback:\"):\n feedback = feedback[len(\"Feedback:\") :].strip()\n if self.mode == \"absolute\":\n if not result.isdigit() or result not in [\"1\", \"2\", \"3\", \"4\", \"5\"]:\n return {\"feedback\": None, \"result\": None}\n return {\"feedback\": feedback, \"result\": int(result)}\n else: # self.mode == \"relative\"\n if result not in [\"A\", \"B\"]:\n return {\"feedback\": None, \"result\": None}\n return {\"feedback\": feedback, \"result\": result}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.inputs","title":"inputs: List[str] property ","text":"The default inputs for the task are the instruction and the generation if reference=False , otherwise, the inputs are instruction , generation , and reference . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.outputs","title":"outputs: List[str] property ","text":"The output for the task are the feedback and the result generated by Prometheus, as well as the model_name which is automatically included based on the LLM used. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.load","title":"load() ","text":"Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation depending on the mode value, and either with or without reference, depending on the value of reference . Source code in src/distilabel/steps/tasks/prometheus_eval.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation\n depending on the `mode` value, and either with or without reference, depending on the\n value of `reference`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"prometheus\"\n / (\n f\"{self.mode}_without_reference.jinja2\"\n if self.reference is False\n else f\"{self.mode}_with_reference.jinja2\"\n )\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType where the prompt is formatted according to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction from the user, including a pre-defined system prompt. 
Source code in src/distilabel/steps/tasks/prometheus_eval.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` where the prompt is formatted according\n to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction\n from the user, including a pre-defined system prompt.\"\"\"\n template_kwargs = {\n \"instruction\": input[\"instruction\"],\n \"rubric\": self.rubrics[self.rubric],\n }\n if self.reference:\n template_kwargs[\"reference\"] = input[\"reference\"]\n\n if self.mode == \"absolute\":\n if not isinstance(input[\"generation\"], str):\n raise DistilabelUserError(\n f\"Provided `generation` is of type {type(input['generation'])} but a string\"\n \" should be provided instead.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n template_kwargs[\"generation\"] = input[\"generation\"]\n system_message = (\n \"You are a fair judge assistant tasked with providing clear, objective feedback based\"\n \" on specific criteria, ensuring each assessment reflects the absolute standards set\"\n \" for performance.\"\n )\n else: # self.mode == \"relative\"\n if (\n not isinstance(input[\"generations\"], list)\n or not all(\n isinstance(generation, str) for generation in input[\"generations\"]\n )\n or len(input[\"generations\"]) != 2\n ):\n raise DistilabelUserError(\n f\"Provided `generations` is of type {type(input['generations'])} but a list of strings with length 2 should be provided instead.\",\n page=\"components-gallery/tasks/prometheuseval/\",\n )\n\n template_kwargs[\"generations\"] = input[\"generations\"]\n system_message = (\n \"You are a fair judge assistant assigned to deliver insightful feedback that compares\"\n \" individual performances, highlighting how each stands relative to others within the\"\n \" same cohort.\"\n )\n\n return [\n {\n \"role\": \"system\",\n \"content\": system_message,\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render(**template_kwargs), # type: ignore\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.format_output","title":"format_output(output, input) ","text":"The output is formatted as a dict with the keys feedback and result captured using a regex from the Prometheus output. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Optionally provided in case it's useful to build the output. required Returns: Type Description Dict[str, Any] A dict with the keys feedback and result generated by the LLM. Source code in src/distilabel/steps/tasks/prometheus_eval.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dict with the keys `feedback` and `result` captured\n using a regex from the Prometheus output.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Optionally provided in case it's useful to build the output.\n\n Returns:\n A dict with the keys `feedback` and `result` generated by the LLM.\n \"\"\"\n if output is None:\n return {\"feedback\": None, \"result\": None}\n\n parts = output.split(\"[RESULT]\")\n if len(parts) != 2:\n return {\"feedback\": None, \"result\": None}\n\n feedback, result = parts[0].strip(), parts[1].strip()\n if feedback.startswith(\"Feedback:\"):\n feedback = feedback[len(\"Feedback:\") :].strip()\n if self.mode == \"absolute\":\n if not result.isdigit() or result not in [\"1\", \"2\", \"3\", \"4\", \"5\"]:\n return {\"feedback\": None, \"result\": None}\n return {\"feedback\": feedback, \"result\": int(result)}\n else: # self.mode == \"relative\"\n if result not in [\"A\", \"B\"]:\n return {\"feedback\": None, \"result\": None}\n return {\"feedback\": feedback, \"result\": result}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer","title":"QualityScorer ","text":" Bases: Task Score responses based on their quality using an LLM . QualityScorer is a pre-defined task that defines the instruction as the input and score as the output. This task is used to rate the quality of instructions and responses. It's an implementation of the quality score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs are scored in terms of quality, obtaining a quality score for each instruction. Attributes: Name Type Description _template Union[Template, None] a Jinja2 template used to format the input for the LLM. Input columns - instruction (
str ): The instruction that was used to generate the responses . - responses (
List[str] ): The responses to be scored. Each response forms a pair with the instruction. Output columns - scores (
List[float] ): The quality score for each response in responses . - model_name (
str ): The model name used to generate the scores. Categories References What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning Examples: Evaluate the quality of your instructions: from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n)\n# result\n[\n {\n 'instructions': 'instruction',\n 'model_name': 'test',\n 'scores': [5, 3, 1],\n }\n]\n Generate structured output with default schema: from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\nscorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n)\n\n# result\n[{'instruction': 'instruction',\n'responses': ['good response', 'weird response', 'bad response'],\n'scores': [1, 2, 3],\n'distilabel_metadata': {'raw_output_quality_scorer_0': '{ \"scores\": [1, 2, 3] }'},\n'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Citations @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n}\n Source code in src/distilabel/steps/tasks/quality_scorer.py class QualityScorer(Task):\n \"\"\"Score responses based on their quality using an `LLM`.\n\n `QualityScorer` is a pre-defined task that defines the `instruction` as the input\n and `score` as the output. This task is used to rate the quality of instructions and responses.\n It's an implementation of the quality score task from the paper 'What Makes Good Data\n for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.\n The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs\n are scored in terms of quality, obtaining a quality score for each instruction.\n\n Attributes:\n _template: a Jinja2 template used to format the input for the LLM.\n\n Input columns:\n - instruction (`str`): The instruction that was used to generate the `responses`.\n - responses (`List[str]`): The responses to be scored. Each response forms a pair with the instruction.\n\n Output columns:\n - scores (`List[float]`): The score for each instruction.\n - model_name (`str`): The model name used to generate the scores.\n\n Categories:\n - scorer\n - quality\n - response\n\n References:\n - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n Examples:\n Evaluate the quality of your instructions:\n\n ```python\n from distilabel.steps.tasks import QualityScorer\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n scorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n )\n\n scorer.load()\n\n result = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n )\n # result\n [\n {\n 'instructions': 'instruction',\n 'model_name': 'test',\n 'scores': [5, 3, 1],\n }\n ]\n ```\n\n Generate structured output with default schema:\n\n ```python\n from distilabel.steps.tasks import QualityScorer\n from distilabel.models import InferenceEndpointsLLM\n\n scorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n use_default_structured_output=True\n )\n\n scorer.load()\n\n result = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n )\n\n # result\n [{'instruction': 'instruction',\n 'responses': ['good response', 'weird response', 'bad response'],\n 'scores': [1, 2, 3],\n 'distilabel_metadata': {'raw_output_quality_scorer_0': '{ \"scores\": [1, 2, 3] }'},\n 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Citations:\n ```\n @misc{liu2024makesgooddataalignment,\n title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n year={2024},\n eprint={2312.15685},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2312.15685},\n }\n ```\n \"\"\"\n\n _template: Union[Template, None] = PrivateAttr(...)\n _can_be_used_with_offline_batch_generation = True\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"quality-scorer.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the task are `instruction` and `responses`.\"\"\"\n return [\"instruction\", \"responses\"]\n\n def format_input(self, input: Dict[str, Any]) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], responses=input[\"responses\"]\n ),\n }\n ]\n\n @property\n def outputs(self):\n \"\"\"The output for the task is a list of `scores` containing the quality score for each\n response in `responses`.\"\"\"\n return [\"scores\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction-response pair.\n \"\"\"\n if output is None:\n return {\"scores\": [None] * len(input[\"responses\"])}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n scores = []\n score_lines = output.split(\"\\n\")\n\n for i, line in enumerate(score_lines):\n match = _PARSE_SCORE_LINE_REGEX.match(line)\n score = float(match.group(1)) if match else None\n scores.append(score)\n if i == len(input[\"responses\"]) - 1:\n break\n return {\"scores\": scores}\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaQualityScorer(BaseModel):\n scores: List[int]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"scores\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Scores\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"scores\"],\n \"title\": \"SchemaQualityScorer\",\n \"type\": \"object\",\n }\n\n def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n ) -> Dict[str, str]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with the scores, and a list with them.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n return {\"scores\": [None] * len(input[\"responses\"])}\n\n @override\n def _sample_input(self) -> ChatType:\n return self.format_input(\n {\n \"instruction\": f\"<PLACEHOLDER_{'instruction'.upper()}>\",\n \"responses\": [\n f\"<PLACEHOLDER_{f'RESPONSE_{i}'.upper()}>\" for i in range(2)\n ],\n }\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.inputs","title":"inputs: List[str] property ","text":"The inputs for the task are instruction and responses . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.outputs","title":"outputs property ","text":"The output for the task is a list of scores containing the quality score for each response in responses . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/quality_scorer.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"quality-scorer.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. 
Source code in src/distilabel/steps/tasks/quality_scorer.py def format_input(self, input: Dict[str, Any]) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], responses=input[\"responses\"]\n ),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.format_output","title":"format_output(output, input) ","text":"The output is formatted as a list with the score of each instruction-response pair. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Dict[str, Any] the input to the task. Used for obtaining the number of responses. required Returns: Type Description Dict[str, Any] A dict with the key scores containing the scores for each instruction-response pair. Source code in src/distilabel/steps/tasks/quality_scorer.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with the key `scores` containing the scores for each instruction-response pair.\n \"\"\"\n if output is None:\n return {\"scores\": [None] * len(input[\"responses\"])}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n scores = []\n score_lines = output.split(\"\\n\")\n\n for i, line in enumerate(score_lines):\n match = _PARSE_SCORE_LINE_REGEX.match(line)\n score = float(match.group(1)) if match else None\n scores.append(score)\n if i == len(input[\"responses\"]) - 1:\n break\n return {\"scores\": scores}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaQualityScorer(BaseModel):\n scores: List[int]\n Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/quality_scorer.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaQualityScorer(BaseModel):\n scores: List[int]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n return {\n \"properties\": {\n \"scores\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Scores\",\n \"type\": \"array\",\n }\n },\n \"required\": [\"scores\"],\n \"title\": \"SchemaQualityScorer\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer._format_structured_output","title":"_format_structured_output(output, input) ","text":"Parses the structured response, which should correspond to a dictionary with the scores, and a list with them. 
Parameters: Name Type Description Default output str The output from the LLM . required Returns: Type Description Dict[str, str] Formatted output. Source code in src/distilabel/steps/tasks/quality_scorer.py def _format_structured_output(\n    self, output: str, input: Dict[str, Any]\n) -> Dict[str, str]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with the scores, and a list with them.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        return {\"scores\": [None] * len(input[\"responses\"])}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct","title":"SelfInstruct ","text":" Bases: Task Generate instructions based on a given input using an LLM . SelfInstruct is a pre-defined task that, given a number of instructions, certain criteria for query generation, an application description, and an input, generates a number of instructions related to the given input and following what is stated in the criteria for query generation and the application description. It is based on the SelfInstruct framework from the paper \"Self-Instruct: Aligning Language Models with Self-Generated Instructions\". Attributes: Name Type Description num_instructions int The number of instructions to be generated. Defaults to 5. criteria_for_query_generation str The criteria for the query generation. Defaults to the criteria defined within the paper. application_description str The description of the AI application that one wants to build with these instructions. Defaults to AI assistant . Input columns - input (
str ): The input to generate the instructions. It's also called seed in the paper. Output columns - instructions (
List[str] ): The generated instructions. - model_name (
str ): The model name used to generate the instructions. Categories Reference Self-Instruct: Aligning Language Models with Self-Generated Instructions Examples: Generate instructions based on a given input: from distilabel.steps.tasks import SelfInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\nself_instruct = SelfInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=5, # This is the default value\n)\n\nself_instruct.load()\n\nresult = next(self_instruct.process([{\"input\": \"instruction\"}]))\n# result\n# [\n# {\n# 'input': 'instruction',\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# 'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n# }\n# ]\n Citations @misc{wang2023selfinstructaligninglanguagemodels,\n title={Self-Instruct: Aligning Language Models with Self-Generated Instructions},\n author={Yizhong Wang and Yeganeh Kordi and Swaroop Mishra and Alisa Liu and Noah A. Smith and Daniel Khashabi and Hannaneh Hajishirzi},\n year={2023},\n eprint={2212.10560},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2212.10560},\n}\n Source code in src/distilabel/steps/tasks/self_instruct.py class SelfInstruct(Task):\n \"\"\"Generate instructions based on a given input using an `LLM`.\n\n `SelfInstruct` is a pre-defined task that, given a number of instructions, a\n certain criteria for query generations, an application description, and an input,\n generates a number of instruction related to the given input and following what\n is stated in the criteria for query generation and the application description.\n It is based in the SelfInstruct framework from the paper \"Self-Instruct: Aligning\n Language Models with Self-Generated Instructions\".\n\n Attributes:\n num_instructions: The number of instructions to be generated. Defaults to 5.\n criteria_for_query_generation: The criteria for the query generation. Defaults\n to the criteria defined within the paper.\n application_description: The description of the AI application that one want\n to build with these instructions. Defaults to `AI assistant`.\n\n Input columns:\n - input (`str`): The input to generate the instructions. 
It's also called seed in\n the paper.\n\n Output columns:\n - instructions (`List[str]`): The generated instructions.\n - model_name (`str`): The model name used to generate the instructions.\n\n Categories:\n - text-generation\n\n Reference:\n - [`Self-Instruct: Aligning Language Models with Self-Generated Instructions`](https://arxiv.org/abs/2212.10560)\n\n Examples:\n Generate instructions based on a given input:\n\n ```python\n from distilabel.steps.tasks import SelfInstruct\n from distilabel.models import InferenceEndpointsLLM\n\n self_instruct = SelfInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=5, # This is the default value\n )\n\n self_instruct.load()\n\n result = next(self_instruct.process([{\"input\": \"instruction\"}]))\n # result\n # [\n # {\n # 'input': 'instruction',\n # 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n # 'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n # }\n # ]\n ```\n\n Citations:\n ```\n @misc{wang2023selfinstructaligninglanguagemodels,\n title={Self-Instruct: Aligning Language Models with Self-Generated Instructions},\n author={Yizhong Wang and Yeganeh Kordi and Swaroop Mishra and Alisa Liu and Noah A. Smith and Daniel Khashabi and Hannaneh Hajishirzi},\n year={2023},\n eprint={2212.10560},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2212.10560},\n }\n ```\n \"\"\"\n\n num_instructions: int = 5\n criteria_for_query_generation: str = (\n \"Incorporate a diverse range of verbs, avoiding repetition.\\n\"\n \"Ensure queries are compatible with AI model's text generation functions and are limited to 1-2 sentences.\\n\"\n \"Design queries to be self-contained and standalone.\\n\"\n 'Blend interrogative (e.g., \"What is the significance of x?\") and imperative (e.g., \"Detail the process of x.\") styles.'\n )\n application_description: str = \"AI assistant\"\n\n _template: Union[Template, None] = PrivateAttr(...)\n _can_be_used_with_offline_batch_generation = True\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"self-instruct.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `input` i.e. seed text.\"\"\"\n return [\"input\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n input=input[\"input\"],\n application_description=self.application_description,\n criteria_for_query_generation=self.criteria_for_query_generation,\n num_instructions=self.num_instructions,\n ),\n }\n ]\n\n @property\n def outputs(self):\n \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n return [\"instructions\", \"model_name\"]\n\n def format_output(\n self,\n output: Union[str, None],\n input: Optional[Dict[str, Any]] = None,\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the generated instructions.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. 
Used for obtaining the number of responses.\n\n Returns:\n A dict with containing the generated instructions.\n \"\"\"\n if output is None:\n return {\"instructions\": []}\n return {\"instructions\": [line for line in output.split(\"\\n\") if line != \"\"]}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.inputs","title":"inputs: List[str] property ","text":"The input for the task is the input i.e. seed text. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.outputs","title":"outputs property ","text":"The output for the task is a list of instructions containing the generated instructions. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/self_instruct.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"self-instruct.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/self_instruct.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n input=input[\"input\"],\n application_description=self.application_description,\n criteria_for_query_generation=self.criteria_for_query_generation,\n num_instructions=self.num_instructions,\n ),\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.format_output","title":"format_output(output, input=None) ","text":"The output is formatted as a list with the generated instructions. Parameters: Name Type Description Default output Union[str, None] the raw output of the LLM. required input Optional[Dict[str, Any]] the input to the task. Used for obtaining the number of responses. None Returns: Type Description Dict[str, Any] A dict with containing the generated instructions. Source code in src/distilabel/steps/tasks/self_instruct.py def format_output(\n self,\n output: Union[str, None],\n input: Optional[Dict[str, Any]] = None,\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a list with the generated instructions.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Used for obtaining the number of responses.\n\n Returns:\n A dict with containing the generated instructions.\n \"\"\"\n if output is None:\n return {\"instructions\": []}\n return {\"instructions\": [line for line in output.split(\"\\n\") if line != \"\"]}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair","title":"GenerateSentencePair ","text":" Bases: Task Generate a positive and negative (optionally) sentences given an anchor sentence. GenerateSentencePair is a pre-defined task that given an anchor sentence generates a positive sentence related to the anchor and optionally a negative sentence unrelated to the anchor or similar to it. Optionally, you can give a context to guide the LLM towards more specific behavior. 
This task is useful to generate training datasets for training embeddings models. Attributes: Name Type Description triplet bool a flag to indicate if the task should generate a triplet of sentences (anchor, positive, negative). Defaults to False . action GenerationAction the action to perform to generate the positive sentence. context str the context to use for the generation. Can be helpful to guide the LLM towards more specific context. Not used by default. hard_negative bool A flag to indicate if the negative should be a hard-negative or not. Hard negatives make it hard for the model to distinguish against the positive, with a higher degree of semantic similarity. Input columns - anchor (
str ): The anchor sentence to generate the positive and negative sentences. Output columns - positive (
str ): The positive sentence related to the anchor . - negative (
str ): The negative sentence unrelated to the anchor if triplet=True , or more similar to the positive to make it more challenging for a model to distinguish in case hard_negative=True . - model_name (
str ): The name of the model that was used to generate the sentences. Categories Examples: Paraphrasing: from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"paraphrase\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n Generating semantically similar sentences: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import GenerateSentencePair\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"semantically-similar\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n Generating queries: from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. 
Using Argilla, ...\"}])\n Generating answers: from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"answer\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n Generating queries with context (applies to every action): from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n Generating Hard-negatives (applies to every action): from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n hard_negative=True,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n Generating structured data with default schema (applies to every action): from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n hard_negative=True,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n use_default_structured_output=True\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n Source code in src/distilabel/steps/tasks/sentence_transformers.py class GenerateSentencePair(Task):\n \"\"\"Generate a positive and negative (optionally) sentences given an anchor sentence.\n\n `GenerateSentencePair` is a pre-defined task that given an anchor sentence generates\n a positive sentence related to the anchor and optionally a negative sentence unrelated\n to the anchor or similar to it. Optionally, you can give a context to guide the LLM\n towards more specific behavior. This task is useful to generate training datasets for\n training embeddings models.\n\n Attributes:\n triplet: a flag to indicate if the task should generate a triplet of sentences\n (anchor, positive, negative). Defaults to `False`.\n action: the action to perform to generate the positive sentence.\n context: the context to use for the generation. 
Can be helpful to guide the LLM\n towards more specific context. Not used by default.\n hard_negative: A flag to indicate if the negative should be a hard-negative or not.\n Hard negatives make it hard for the model to distinguish against the positive,\n with a higher degree of semantic similarity.\n\n Input columns:\n - anchor (`str`): The anchor sentence to generate the positive and negative sentences.\n\n Output columns:\n - positive (`str`): The positive sentence related to the `anchor`.\n - negative (`str`): The negative sentence unrelated to the `anchor` if `triplet=True`,\n or more similar to the positive to make it more challenging for a model to distinguish\n in case `hard_negative=True`.\n - model_name (`str`): The name of the model that was used to generate the sentences.\n\n Categories:\n - embedding\n\n Examples:\n Paraphrasing:\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"paraphrase\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n ```\n\n Generating semantically similar sentences:\n\n ```python\n from distilabel.models import InferenceEndpointsLLM\n from distilabel.steps.tasks import GenerateSentencePair\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"semantically-similar\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n ```\n\n Generating queries:\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. 
Using Argilla, ...\"}])\n ```\n\n Generating answers:\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"answer\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n ```\n\n Generating queries with context (**applies to every action**):\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n ```\n\n Generating Hard-negatives (**applies to every action**):\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n hard_negative=True,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n ```\n\n Generating structured data with default schema (**applies to every action**):\n\n ```python\n from distilabel.steps.tasks import GenerateSentencePair\n from distilabel.models import InferenceEndpointsLLM\n\n generate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n hard_negative=True,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n use_default_structured_output=True\n )\n\n generate_sentence_pair.load()\n\n result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n ```\n \"\"\"\n\n triplet: bool = False\n action: GenerationAction\n hard_negative: bool = False\n context: str = \"\"\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"generate-sentence-pair.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs for the task is the `anchor` sentence.\"\"\"\n return [\"anchor\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The inputs are formatted as a `ChatType`, with a system prompt describing the\n task of generating a positive and negative sentences 
for the anchor sentence. The\n anchor is provided as the first user interaction in the conversation.\n\n Args:\n input: The input containing the `anchor` sentence.\n\n Returns:\n A list of dictionaries containing the system and user interactions.\n \"\"\"\n action_sentence = GENERATION_ACTION_SENTENCES[self.action]\n\n format_system_prompt = {\n \"action_sentence\": action_sentence,\n \"context\": CONTEXT_INTRO if self.context else \"\",\n }\n if self.triplet:\n format_system_prompt[\"negative_style\"] = NEGATIVE_STYLE[\n \"hard-negative\" if self.hard_negative else \"negative\"\n ]\n\n system_prompt = (\n POSITIVE_NEGATIVE_SYSTEM_PROMPT if self.triplet else POSITIVE_SYSTEM_PROMPT\n ).format(**format_system_prompt)\n\n return [\n {\"role\": \"system\", \"content\": system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n anchor=input[\"anchor\"],\n context=self.context if self.context else None,\n ),\n },\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The outputs for the task are the `positive` and `negative` sentences, as well\n as the `model_name` used to generate the sentences.\"\"\"\n columns = [\"positive\", \"negative\"] if self.triplet else [\"positive\"]\n columns += [\"model_name\"]\n return columns\n\n def format_output(\n self, output: Union[str, None], input: Optional[Dict[str, Any]] = None\n ) -> Dict[str, Any]:\n \"\"\"Formats the output of the LLM, to extract the `positive` and `negative` sentences\n generated. If the output is `None` or the regex doesn't match, then the outputs\n will be set to `None` as well.\n\n Args:\n output: The output of the LLM.\n input: The input used to generate the output.\n\n Returns:\n The formatted output containing the `positive` and `negative` sentences.\n \"\"\"\n if output is None:\n return {\"positive\": None, \"negative\": None}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output)\n\n match = POSITIVE_NEGATIVE_PAIR_REGEX.match(output)\n if match is None:\n formatted_output = {\"positive\": None}\n if self.triplet:\n formatted_output[\"negative\"] = None\n return formatted_output\n\n groups = match.groups()\n if self.triplet:\n return {\n \"positive\": groups[0].strip(),\n \"negative\": (\n groups[1].strip()\n if len(groups) > 1 and groups[1] is not None\n else None\n ),\n }\n\n return {\"positive\": groups[0].strip()}\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.triplet:\n return {\n \"properties\": {\n \"positive\": {\"title\": \"Positive\", \"type\": \"string\"},\n \"negative\": {\"title\": \"Negative\", \"type\": \"string\"},\n },\n \"required\": [\"positive\", \"negative\"],\n \"title\": \"Schema\",\n \"type\": \"object\",\n }\n return {\n \"properties\": {\"positive\": {\"title\": \"Positive\", \"type\": \"string\"}},\n \"required\": [\"positive\"],\n \"title\": \"Schema\",\n \"type\": \"object\",\n }\n\n def _format_structured_output(self, output: str) -> Dict[str, str]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.triplet:\n return 
{\"positive\": None, \"negative\": None}\n return {\"positive\": None}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.inputs","title":"inputs: List[str] property ","text":"The inputs for the task is the anchor sentence. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.outputs","title":"outputs: List[str] property ","text":"The outputs for the task are the positive and negative sentences, as well as the model_name used to generate the sentences. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.load","title":"load() ","text":"Loads the Jinja2 template. Source code in src/distilabel/steps/tasks/sentence_transformers.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"generate-sentence-pair.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.format_input","title":"format_input(input) ","text":"The inputs are formatted as a ChatType , with a system prompt describing the task of generating a positive and negative sentences for the anchor sentence. The anchor is provided as the first user interaction in the conversation. Parameters: Name Type Description Default input Dict[str, Any] The input containing the anchor sentence. required Returns: Type Description ChatType A list of dictionaries containing the system and user interactions. Source code in src/distilabel/steps/tasks/sentence_transformers.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The inputs are formatted as a `ChatType`, with a system prompt describing the\n task of generating a positive and negative sentences for the anchor sentence. The\n anchor is provided as the first user interaction in the conversation.\n\n Args:\n input: The input containing the `anchor` sentence.\n\n Returns:\n A list of dictionaries containing the system and user interactions.\n \"\"\"\n action_sentence = GENERATION_ACTION_SENTENCES[self.action]\n\n format_system_prompt = {\n \"action_sentence\": action_sentence,\n \"context\": CONTEXT_INTRO if self.context else \"\",\n }\n if self.triplet:\n format_system_prompt[\"negative_style\"] = NEGATIVE_STYLE[\n \"hard-negative\" if self.hard_negative else \"negative\"\n ]\n\n system_prompt = (\n POSITIVE_NEGATIVE_SYSTEM_PROMPT if self.triplet else POSITIVE_SYSTEM_PROMPT\n ).format(**format_system_prompt)\n\n return [\n {\"role\": \"system\", \"content\": system_prompt},\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n anchor=input[\"anchor\"],\n context=self.context if self.context else None,\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.format_output","title":"format_output(output, input=None) ","text":"Formats the output of the LLM, to extract the positive and negative sentences generated. If the output is None or the regex doesn't match, then the outputs will be set to None as well. Parameters: Name Type Description Default output Union[str, None] The output of the LLM. required input Optional[Dict[str, Any]] The input used to generate the output. None Returns: Type Description Dict[str, Any] The formatted output containing the positive and negative sentences. 
Source code in src/distilabel/steps/tasks/sentence_transformers.py def format_output(\n self, output: Union[str, None], input: Optional[Dict[str, Any]] = None\n) -> Dict[str, Any]:\n \"\"\"Formats the output of the LLM, to extract the `positive` and `negative` sentences\n generated. If the output is `None` or the regex doesn't match, then the outputs\n will be set to `None` as well.\n\n Args:\n output: The output of the LLM.\n input: The input used to generate the output.\n\n Returns:\n The formatted output containing the `positive` and `negative` sentences.\n \"\"\"\n if output is None:\n return {\"positive\": None, \"negative\": None}\n\n if self.use_default_structured_output:\n return self._format_structured_output(output)\n\n match = POSITIVE_NEGATIVE_PAIR_REGEX.match(output)\n if match is None:\n formatted_output = {\"positive\": None}\n if self.triplet:\n formatted_output[\"negative\"] = None\n return formatted_output\n\n groups = match.groups()\n if self.triplet:\n return {\n \"positive\": groups[0].strip(),\n \"negative\": (\n groups[1].strip()\n if len(groups) > 1 and groups[1] is not None\n else None\n ),\n }\n\n return {\"positive\": groups[0].strip()}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/sentence_transformers.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.triplet:\n return {\n \"properties\": {\n \"positive\": {\"title\": \"Positive\", \"type\": \"string\"},\n \"negative\": {\"title\": \"Negative\", \"type\": \"string\"},\n },\n \"required\": [\"positive\", \"negative\"],\n \"title\": \"Schema\",\n \"type\": \"object\",\n }\n return {\n \"properties\": {\"positive\": {\"title\": \"Positive\", \"type\": \"string\"}},\n \"required\": [\"positive\"],\n \"title\": \"Schema\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair._format_structured_output","title":"_format_structured_output(output) ","text":"Parses the structured response, which should correspond to a dictionary with either positive , or positive and negative keys. Parameters: Name Type Description Default output str The output from the LLM . required Returns: Type Description Dict[str, str] Formatted output. 
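For reference (an illustrative sketch, not taken from the library), the triplet JSON schema enforced by get_structured_output above matches what Pydantic produces for an equivalent model (key order may differ): from pydantic import BaseModel\n\nclass Schema(BaseModel):\n positive: str\n negative: str\n\nprint(Schema.model_json_schema())\n# {'properties': {'positive': {'title': 'Positive', 'type': 'string'}, 'negative': {'title': 'Negative', 'type': 'string'}}, 'required': ['positive', 'negative'], 'title': 'Schema', 'type': 'object'}\n 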
Source code in src/distilabel/steps/tasks/sentence_transformers.py def _format_structured_output(self, output: str) -> Dict[str, str]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.triplet:\n return {\"positive\": None, \"negative\": None}\n return {\"positive\": None}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration","title":"StructuredGeneration ","text":" Bases: Task Generate structured content for a given instruction using an LLM . StructuredGeneration is a pre-defined task that defines the instruction and the structured_output as the inputs, and generation as the output. This task is used to generate structured content based on the input instruction and following the schema provided within the structured_output column per each instruction . The model_name also returned as part of the output in order to enhance it. Attributes: Name Type Description use_system_prompt bool Whether to use the system prompt in the generation. Defaults to True , which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored. Input columns - instruction (
str ): The instruction to generate structured content from. - structured_output (
Dict[str, Any] ): The structured_output to generate structured content from. It should be a Python dictionary with the keys format and schema , where format should be one of json or regex , and the schema should be either the JSON schema or the regex pattern, respectively. Output columns - generation (
str ): The generated text matching the provided schema, if possible. - model_name (
str ): The name of the model used to generate the text. Categories - outlines
- structured-generation
Examples: Generate structured output from a JSON schema: from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n)\n\nstructured_gen.load()\n\nresult = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"Create an RPG character\",\n \"structured_output\": {\n \"format\": \"json\",\n \"schema\": {\n \"properties\": {\n \"name\": {\n \"title\": \"Name\",\n \"type\": \"string\"\n },\n \"description\": {\n \"title\": \"Description\",\n \"type\": \"string\"\n },\n \"role\": {\n \"title\": \"Role\",\n \"type\": \"string\"\n },\n \"weapon\": {\n \"title\": \"Weapon\",\n \"type\": \"string\"\n }\n },\n \"required\": [\n \"name\",\n \"description\",\n \"role\",\n \"weapon\"\n ],\n \"title\": \"Character\",\n \"type\": \"object\"\n }\n },\n }\n ]\n )\n)\n Generate structured output from a regex pattern (only works with LLMs that support regex, the providers using outlines): from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n)\n\nstructured_gen.load()\n\nresult = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n \"structured_output\": {\n \"format\": \"regex\",\n \"schema\": r\"(\\d{1,2})\u00b0C\"\n },\n\n }\n ]\n )\n)\n Source code in src/distilabel/steps/tasks/structured_generation.py class StructuredGeneration(Task):\n \"\"\"Generate structured content for a given `instruction` using an `LLM`.\n\n `StructuredGeneration` is a pre-defined task that defines the `instruction` and the `structured_output`\n as the inputs, and `generation` as the output. This task is used to generate structured content based on\n the input instruction and following the schema provided within the `structured_output` column per each\n `instruction`. The `model_name` also returned as part of the output in order to enhance it.\n\n Attributes:\n use_system_prompt: Whether to use the system prompt in the generation. Defaults to `True`,\n which means that if the column `system_prompt` is defined within the input batch, then\n the `system_prompt` will be used, otherwise, it will be ignored.\n\n Input columns:\n - instruction (`str`): The instruction to generate structured content from.\n - structured_output (`Dict[str, Any]`): The structured_output to generate structured content from. 
It should be a\n Python dictionary with the keys `format` and `schema`, where `format` should be one of `json` or\n `regex`, and the `schema` should be either the JSON schema or the regex pattern, respectively.\n\n Output columns:\n - generation (`str`): The generated text matching the provided schema, if possible.\n - model_name (`str`): The name of the model used to generate the text.\n\n Categories:\n - outlines\n - structured-generation\n\n Examples:\n Generate structured output from a JSON schema:\n\n ```python\n from distilabel.steps.tasks import StructuredGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n structured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n )\n\n structured_gen.load()\n\n result = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"Create an RPG character\",\n \"structured_output\": {\n \"format\": \"json\",\n \"schema\": {\n \"properties\": {\n \"name\": {\n \"title\": \"Name\",\n \"type\": \"string\"\n },\n \"description\": {\n \"title\": \"Description\",\n \"type\": \"string\"\n },\n \"role\": {\n \"title\": \"Role\",\n \"type\": \"string\"\n },\n \"weapon\": {\n \"title\": \"Weapon\",\n \"type\": \"string\"\n }\n },\n \"required\": [\n \"name\",\n \"description\",\n \"role\",\n \"weapon\"\n ],\n \"title\": \"Character\",\n \"type\": \"object\"\n }\n },\n }\n ]\n )\n )\n ```\n\n Generate structured output from a regex pattern (only works with LLMs that support regex, the providers using outlines):\n\n ```python\n from distilabel.steps.tasks import StructuredGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n structured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n )\n\n structured_gen.load()\n\n result = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n \"structured_output\": {\n \"format\": \"regex\",\n \"schema\": r\"(\\\\d{1,2})\u00b0C\"\n },\n\n }\n ]\n )\n )\n ```\n \"\"\"\n\n use_system_prompt: bool = False\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task are the `instruction` and the `structured_output`.\n Optionally, if the `use_system_prompt` flag is set to True, then the\n `system_prompt` will be used too.\"\"\"\n columns = [\"instruction\", \"structured_output\"]\n if self.use_system_prompt:\n columns = [\"system_prompt\"] + columns\n return columns\n\n def format_input(self, input: Dict[str, Any]) -> StructuredInput:\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n if not isinstance(input[\"instruction\"], str):\n raise DistilabelUserError(\n f\"Input `instruction` must be a string. 
Got: {input['instruction']}.\",\n page=\"components-gallery/tasks/structuredgeneration/\",\n )\n\n messages = [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n if self.use_system_prompt:\n if \"system_prompt\" in input:\n messages.insert(\n 0, {\"role\": \"system\", \"content\": input[\"system_prompt\"]}\n )\n else:\n warnings.warn(\n \"`use_system_prompt` is set to `True`, but no `system_prompt` in input batch, so it will be ignored.\",\n UserWarning,\n stacklevel=2,\n )\n\n return (messages, input.get(\"structured_output\", None)) # type: ignore\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n return [\"generation\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`. Note that even\n if the `structured_output` is defined to produce a JSON schema, this method will return the raw\n output i.e. a string without any parsing.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.inputs","title":"inputs: List[str] property ","text":"The input for the task are the instruction and the structured_output . Optionally, if the use_system_prompt flag is set to True, then the system_prompt will be used too. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.outputs","title":"outputs: List[str] property ","text":"The output for the task is the generation and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/structured_generation.py def format_input(self, input: Dict[str, Any]) -> StructuredInput:\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n if not isinstance(input[\"instruction\"], str):\n raise DistilabelUserError(\n f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n page=\"components-gallery/tasks/structuredgeneration/\",\n )\n\n messages = [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n if self.use_system_prompt:\n if \"system_prompt\" in input:\n messages.insert(\n 0, {\"role\": \"system\", \"content\": input[\"system_prompt\"]}\n )\n else:\n warnings.warn(\n \"`use_system_prompt` is set to `True`, but no `system_prompt` in input batch, so it will be ignored.\",\n UserWarning,\n stacklevel=2,\n )\n\n return (messages, input.get(\"structured_output\", None)) # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.format_output","title":"format_output(output, input) ","text":"The output is formatted as a dictionary with the generation . The model_name will be automatically included within the process method of Task . Note that even if the structured_output is defined to produce a JSON schema, this method will return the raw output i.e. a string without any parsing. 
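Because the raw string is returned without parsing, callers may decode it themselves after running the task. An illustrative sketch (result refers to the list yielded by process in the JSON-schema example above, and it assumes the LLM actually followed the schema): import json\n\nrow = result[0] # first row produced by the example above\ncharacter = json.loads(row[\"generation\"]) # assumes the model emitted valid JSON\nprint(character[\"name\"], character[\"weapon\"])\n 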
Source code in src/distilabel/steps/tasks/structured_generation.py def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`. Note that even\n if the `structured_output` is defined to produce a JSON schema, this method will return the raw\n output i.e. a string without any parsing.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification","title":"TextClassification ","text":" Bases: Task Classifies text into one or more categories or labels. This task can be used for text classification problems, where the goal is to assign one or multiple labels to a given text. It uses structured generation as per the reference paper by default, it can help to generate more concise labels. See section 4.1 in the reference. Input columns - text (
str ): The reference text we want to obtain labels for. Output columns - labels (
Union[str, List[str]] ): The label or list of labels for the text. - model_name (
str ): The name of the model used to generate the label/s. Categories References Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models Attributes: Name Type Description system_prompt Optional[str] A prompt to display to the user before the task starts. Contains a default message to make the model behave like a classifier specialist. n PositiveInt Number of labels to generate If only 1 is required, corresponds to a label classification problem, if >1 it will intend return the \"n\" labels most representative for the text. Defaults to 1. context Optional[str] Context to use when generating the labels. By default contains a generic message, but can be used to customize the context for the task. examples Optional[List[str]] List of examples to help the model understand the task, few shots. available_labels Optional[Union[List[str], Dict[str, str]]] List of available labels to choose from when classifying the text, or a dictionary with the labels and their descriptions. default_label Optional[Union[str, List[str]]] Default label to use when the text is ambiguous or lacks sufficient information for classification. Can be a list in case of multiple labels (n>1). Examples: Assigning a sentiment to a text: from distilabel.steps.tasks import TextClassification\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\ntext_classification = TextClassification(\n llm=llm,\n context=\"You are an AI system specialized in assigning sentiment to movies.\",\n available_labels=[\"positive\", \"negative\"],\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n )\n)\n# result\n# [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n# 'labels': 'positive',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"positive\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"positive\", # The text shows positive sentiment\\n \"negative\", # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. 
Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Assigning predefined labels with specified descriptions: from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n llm=llm,\n n=1,\n context=\"Determine the intent of the text.\",\n available_labels={\n \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n \"inquiry\": \"A question or request for information about a product, service, or situation. It's a neutral or curious expression seeking clarification or details.\",\n \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n },\n query_title=\"Customer Query\",\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"Can you tell me more about your return policy?\"}]\n )\n)\n# result\n# [{'text': 'Can you tell me more about your return policy?',\n# 'labels': 'inquiry',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"inquiry\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"complaint\", # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n \"inquiry\", # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n \"feedback\", # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n \"praise\", # A statement expressing admiration, approval, or appreciation for a product, service, or experience. 
It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Free multi label classification without predefined labels: from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n llm=llm,\n n=3,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following type of persona.\"\n ),\n query_title=\"Example of Persona\",\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\"}]\n )\n)\n# result\n# [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n# 'labels': ['Historical Researcher',\n# 'Cultural Specialist',\n# 'Ethnic Studies Expert'],\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Source code in src/distilabel/steps/tasks/text_classification.py class TextClassification(Task):\n r\"\"\"Classifies text into one or more categories or labels.\n\n This task can be used for text classification problems, where the goal is to assign\n one or multiple labels to a given text.\n It uses structured generation as per the reference paper by default,\n it can help to generate more concise labels. 
See section 4.1 in the reference.\n\n Input columns:\n - text (`str`): The reference text we want to obtain labels for.\n\n Output columns:\n - labels (`Union[str, List[str]]`): The label or list of labels for the text.\n - model_name (`str`): The name of the model used to generate the label/s.\n\n Categories:\n - text-classification\n\n References:\n - [`Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models`](https://arxiv.org/abs/2408.02442)\n\n Attributes:\n system_prompt: A prompt to display to the user before the task starts. Contains a default\n message to make the model behave like a classifier specialist.\n n: Number of labels to generate If only 1 is required, corresponds to a label\n classification problem, if >1 it will intend return the \"n\" labels most representative\n for the text. Defaults to 1.\n context: Context to use when generating the labels. By default contains a generic message,\n but can be used to customize the context for the task.\n examples: List of examples to help the model understand the task, few shots.\n available_labels: List of available labels to choose from when classifying the text, or\n a dictionary with the labels and their descriptions.\n default_label: Default label to use when the text is ambiguous or lacks sufficient information for\n classification. Can be a list in case of multiple labels (n>1).\n\n Examples:\n Assigning a sentiment to a text:\n\n ```python\n from distilabel.steps.tasks import TextClassification\n from distilabel.models import InferenceEndpointsLLM\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n )\n\n text_classification = TextClassification(\n llm=llm,\n context=\"You are an AI system specialized in assigning sentiment to movies.\",\n available_labels=[\"positive\", \"negative\"],\n )\n\n text_classification.load()\n\n result = next(\n text_classification.process(\n [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n )\n )\n # result\n # [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n # 'labels': 'positive',\n # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"positive\"\\n}',\n # 'raw_input_text_classification_0': [{'role': 'system',\n # 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n # {'role': 'user',\n # 'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"positive\", # The text shows positive sentiment\\n \"negative\", # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. 
Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Assigning predefined labels with specified descriptions:\n\n ```python\n from distilabel.steps.tasks import TextClassification\n\n text_classification = TextClassification(\n llm=llm,\n n=1,\n context=\"Determine the intent of the text.\",\n available_labels={\n \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n \"inquiry\": \"A question or request for information about a product, service, or situation. It's a neutral or curious expression seeking clarification or details.\",\n \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n },\n query_title=\"Customer Query\",\n )\n\n text_classification.load()\n\n result = next(\n text_classification.process(\n [{\"text\": \"Can you tell me more about your return policy?\"}]\n )\n )\n # result\n # [{'text': 'Can you tell me more about your return policy?',\n # 'labels': 'inquiry',\n # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"inquiry\"\\n}',\n # 'raw_input_text_classification_0': [{'role': 'system',\n # 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n # {'role': 'user',\n # 'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"complaint\", # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n \"inquiry\", # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n \"feedback\", # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n \"praise\", # A statement expressing admiration, approval, or appreciation for a product, service, or experience. 
It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Free multi label classification without predefined labels:\n\n ```python\n from distilabel.steps.tasks import TextClassification\n\n text_classification = TextClassification(\n llm=llm,\n n=3,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following type of persona.\"\n ),\n query_title=\"Example of Persona\",\n )\n\n text_classification.load()\n\n result = next(\n text_classification.process(\n [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\"}]\n )\n )\n # result\n # [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n # 'labels': ['Historical Researcher',\n # 'Cultural Specialist',\n # 'Ethnic Studies Expert'],\n # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n # 'raw_input_text_classification_0': [{'role': 'system',\n # 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n # {'role': 'user',\n # 'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n \"\"\"\n\n system_prompt: Optional[str] = (\n \"You are an AI system specialized in generating labels to classify pieces of text. \"\n \"Your sole purpose is to analyze the given text and provide appropriate classification labels.\"\n )\n n: PositiveInt = Field(\n default=1,\n description=\"Number of labels to generate. 
Defaults to 1.\",\n )\n context: Optional[str] = Field(\n default=\"Generate concise, relevant labels that accurately represent the text's main themes, topics, or categories.\",\n description=\"Context to use when generating the labels.\",\n )\n examples: Optional[List[str]] = Field(\n default=None,\n description=\"List of examples to help the model understand the task, few shots.\",\n )\n available_labels: Optional[Union[List[str], Dict[str, str]]] = Field(\n default=None,\n description=(\n \"List of available labels to choose from when classifying the text, or \"\n \"a dictionary with the labels and their descriptions.\"\n ),\n )\n default_label: Optional[Union[str, List[str]]] = Field(\n default=\"Unclassified\",\n description=(\n \"Default label to use when the text is ambiguous or lacks sufficient information for \"\n \"classification. Can be a list in case of multiple labels (n>1).\"\n ),\n )\n query_title: str = Field(\n default=\"User Query\",\n description=\"Title of the query used to show the example/s to classify.\",\n )\n use_default_structured_output: bool = True\n\n _template: Optional[Template] = PrivateAttr(default=None)\n\n def load(self) -> None:\n super().load()\n self._template = Template(TEXT_CLASSIFICATION_TEMPLATE)\n self._labels_format: str = (\n '\"label\"'\n if self.n == 1\n else \"[\" + \", \".join([f'\"label_{i}\"' for i in range(self.n)]) + \"]\"\n )\n self._labels_message: str = (\n \"Provide the label that best describes the text.\"\n if self.n == 1\n else f\"Provide a list of {self.n} labels that best describe the text.\"\n )\n self._available_labels_message: str = self._get_available_labels_message()\n self._examples: str = self._get_examples_message()\n\n def _get_available_labels_message(self) -> str:\n \"\"\"Prepares the message to display depending on the available labels (if any),\n and whether the labels have a specific context.\n \"\"\"\n if self.available_labels is None:\n return (\n \"Use clear, widely understood terms for labels.\"\n \"Avoid overly specific or obscure labels unless the text demands it.\"\n )\n\n msg = (\n \"## Labeling the user input\\n\"\n \"Use the available labels to classify the user query{label_context}:\\n\"\n \"available_labels = {available_labels}\"\n )\n if isinstance(self.available_labels, list):\n specific_msg = (\n \"[\\n\"\n + indent(\n \"\".join([f'\"{label}\",\\n' for label in self.available_labels]),\n prefix=\" \" * 4,\n )\n + \"]\"\n )\n return msg.format(label_context=\"\", available_labels=specific_msg)\n\n elif isinstance(self.available_labels, dict):\n specific_msg = \"\"\n for label, description in self.available_labels.items():\n specific_msg += indent(\n f'\"{label}\", # {description}' + \"\\n\", prefix=\" \" * 4\n )\n\n specific_msg = \"[\\n\" + specific_msg + \"]\"\n return msg.format(\n label_context=\". 
Analyze the context of each label specifically\",\n available_labels=specific_msg,\n )\n\n def _get_examples_message(self) -> str:\n \"\"\"Prepares the message to display depending on the examples provided.\"\"\"\n if self.examples is None:\n return \"\"\n\n examples_msg = \"\\n\".join([f\"- {ex}\" for ex in self.examples])\n\n return (\n \"\\n## Examples\\n\"\n \"Here are some examples to help you understand the task:\\n\"\n f\"{examples_msg}\"\n )\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`.\"\"\"\n return [\"text\"]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n return [\"labels\", \"model_name\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n messages = [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n context=f\"\\n{self.context}\",\n labels_message=self._labels_message,\n available_labels=self._available_labels_message,\n examples=self._examples,\n default_label=self.default_label,\n labels_format=self._labels_format,\n query_title=self.query_title,\n text=input[\"text\"],\n ),\n },\n ]\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n return messages\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return self._format_structured_output(output)\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.n > 1:\n\n class MultiLabelSchema(BaseModel):\n labels: List[str]\n\n return MultiLabelSchema.model_json_schema()\n\n class SingleLabelSchema(BaseModel):\n labels: str\n\n return SingleLabelSchema.model_json_schema()\n\n def _format_structured_output(\n self, output: str\n ) -> Dict[str, Union[str, List[str]]]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with the `labels`, and either a string or a list of strings with the labels.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.n > 1:\n return {\"labels\": [None for _ in range(self.n)]}\n return {\"labels\": None}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.inputs","title":"inputs: List[str] property ","text":"The input for the task is the instruction . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.outputs","title":"outputs: List[str] property ","text":"The output for the task is the generation and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._get_available_labels_message","title":"_get_available_labels_message() ","text":"Prepares the message to display depending on the available labels (if any), and whether the labels have a specific context. 
Source code in src/distilabel/steps/tasks/text_classification.py def _get_available_labels_message(self) -> str:\n \"\"\"Prepares the message to display depending on the available labels (if any),\n and whether the labels have a specific context.\n \"\"\"\n if self.available_labels is None:\n return (\n \"Use clear, widely understood terms for labels.\"\n \"Avoid overly specific or obscure labels unless the text demands it.\"\n )\n\n msg = (\n \"## Labeling the user input\\n\"\n \"Use the available labels to classify the user query{label_context}:\\n\"\n \"available_labels = {available_labels}\"\n )\n if isinstance(self.available_labels, list):\n specific_msg = (\n \"[\\n\"\n + indent(\n \"\".join([f'\"{label}\",\\n' for label in self.available_labels]),\n prefix=\" \" * 4,\n )\n + \"]\"\n )\n return msg.format(label_context=\"\", available_labels=specific_msg)\n\n elif isinstance(self.available_labels, dict):\n specific_msg = \"\"\n for label, description in self.available_labels.items():\n specific_msg += indent(\n f'\"{label}\", # {description}' + \"\\n\", prefix=\" \" * 4\n )\n\n specific_msg = \"[\\n\" + specific_msg + \"]\"\n return msg.format(\n label_context=\". Analyze the context of each label specifically\",\n available_labels=specific_msg,\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._get_examples_message","title":"_get_examples_message() ","text":"Prepares the message to display depending on the examples provided. Source code in src/distilabel/steps/tasks/text_classification.py def _get_examples_message(self) -> str:\n \"\"\"Prepares the message to display depending on the examples provided.\"\"\"\n if self.examples is None:\n return \"\"\n\n examples_msg = \"\\n\".join([f\"- {ex}\" for ex in self.examples])\n\n return (\n \"\\n## Examples\\n\"\n \"Here are some examples to help you understand the task:\\n\"\n f\"{examples_msg}\"\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/text_classification.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n messages = [\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n context=f\"\\n{self.context}\",\n labels_message=self._labels_message,\n available_labels=self._available_labels_message,\n examples=self._examples,\n default_label=self.default_label,\n labels_format=self._labels_format,\n query_title=self.query_title,\n text=input[\"text\"],\n ),\n },\n ]\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n return messages\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.format_output","title":"format_output(output, input=None) ","text":"The output is formatted as a dictionary with the generation . The model_name will be automatically included within the process method of Task . Source code in src/distilabel/steps/tasks/text_classification.py def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. 
The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return self._format_structured_output(output)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. Source code in src/distilabel/steps/tasks/text_classification.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.n > 1:\n\n class MultiLabelSchema(BaseModel):\n labels: List[str]\n\n return MultiLabelSchema.model_json_schema()\n\n class SingleLabelSchema(BaseModel):\n labels: str\n\n return SingleLabelSchema.model_json_schema()\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._format_structured_output","title":"_format_structured_output(output) ","text":"Parses the structured response, which should correspond to a dictionary with the labels , and either a string or a list of strings with the labels. Parameters: Name Type Description Default output str The output from the LLM . required Returns: Type Description Dict[str, Union[str, List[str]]] Formatted output. Source code in src/distilabel/steps/tasks/text_classification.py def _format_structured_output(\n self, output: str\n) -> Dict[str, Union[str, List[str]]]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with the `labels`, and either a string or a list of strings with the labels.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.n > 1:\n return {\"labels\": [None for _ in range(self.n)]}\n return {\"labels\": None}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration","title":"ChatGeneration ","text":" Bases: Task Generates text based on a conversation. ChatGeneration is a pre-defined task that defines the messages as the input and generation as the output. This task is used to generate text based on a conversation. The model_name is also returned as part of the output in order to enhance it. Input columns - messages (
List[Dict[Literal[\"role\", \"content\"], str]] ): The messages to generate the follow up completion from. Output columns - generation (
str ): The generated text from the assistant. - model_name (
str ): The model name used to generate the text. Categories Icon :material-chat: Examples: Generate text from a conversation in OpenAI chat format: from distilabel.steps.tasks import ChatGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nchat = ChatGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nchat.load()\n\nresult = next(\n chat.process(\n [\n {\n \"messages\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n ]\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# 'generation': '4',\n# }\n# ]\n Source code in src/distilabel/steps/tasks/text_generation.py class ChatGeneration(Task):\n \"\"\"Generates text based on a conversation.\n\n `ChatGeneration` is a pre-defined task that defines the `messages` as the input\n and `generation` as the output. This task is used to generate text based on a conversation.\n The `model_name` is also returned as part of the output in order to enhance it.\n\n Input columns:\n - messages (`List[Dict[Literal[\"role\", \"content\"], str]]`): The messages to generate the\n follow up completion from.\n\n Output columns:\n - generation (`str`): The generated text from the assistant.\n - model_name (`str`): The model name used to generate the text.\n\n Categories:\n - chat-generation\n\n Icon:\n `:material-chat:`\n\n Examples:\n Generate text from a conversation in OpenAI chat format:\n\n ```python\n from distilabel.steps.tasks import ChatGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n chat = ChatGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n )\n\n chat.load()\n\n result = next(\n chat.process(\n [\n {\n \"messages\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n ]\n }\n ]\n )\n )\n # result\n # [\n # {\n # 'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n # 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n # 'generation': '4',\n # }\n # ]\n ```\n \"\"\"\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task are the `messages`.\"\"\"\n return [\"messages\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the messages provided\n are already formatted that way i.e. following the OpenAI chat format.\"\"\"\n\n if not is_openai_format(input[\"messages\"]):\n raise DistilabelUserError(\n \"Input `messages` must be an OpenAI chat-like format conversation. \"\n f\"Got: {input['messages']}. Please check: 'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n page=\"components-gallery/tasks/chatgeneration/\",\n )\n\n if input[\"messages\"][-1][\"role\"] != \"user\":\n raise DistilabelUserError(\n \"The last message must be from the user. 
Please check: \"\n \"'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n page=\"components-gallery/tasks/chatgeneration/\",\n )\n\n return input[\"messages\"]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n return [\"generation\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.inputs","title":"inputs: List[str] property ","text":"The input for the task are the messages . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.outputs","title":"outputs: List[str] property ","text":"The output for the task is the generation and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the messages provided are already formatted that way i.e. following the OpenAI chat format. Source code in src/distilabel/steps/tasks/text_generation.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the messages provided\n are already formatted that way i.e. following the OpenAI chat format.\"\"\"\n\n if not is_openai_format(input[\"messages\"]):\n raise DistilabelUserError(\n \"Input `messages` must be an OpenAI chat-like format conversation. \"\n f\"Got: {input['messages']}. Please check: 'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n page=\"components-gallery/tasks/chatgeneration/\",\n )\n\n if input[\"messages\"][-1][\"role\"] != \"user\":\n raise DistilabelUserError(\n \"The last message must be from the user. Please check: \"\n \"'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n page=\"components-gallery/tasks/chatgeneration/\",\n )\n\n return input[\"messages\"]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.format_output","title":"format_output(output, input=None) ","text":"The output is formatted as a dictionary with the generation . The model_name will be automatically included within the process method of Task . Source code in src/distilabel/steps/tasks/text_generation.py def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration","title":"TextGeneration ","text":" Bases: Task Text generation with an LLM given a prompt. TextGeneration is a pre-defined task that allows passing a custom prompt using the Jinja2 syntax. By default, a instruction is expected in the inputs, but the using template and columns attributes one can define a custom prompt and columns expected from the text. This task should be good enough for tasks that don't need post-processing of the responses generated by the LLM. 
Attributes: Name Type Description system_prompt Union[str, None] The system prompt to use in the generation. If not provided, then it will check if the input row has a column named system_prompt and use it. If not, then no system prompt will be used. Defaults to None . template str The template to use for the generation. It must follow the Jinja2 template syntax. If not provided, it will assume the text passed is an instruction and construct the appropriate template. columns Union[str, List[str]] A string with the column, or a list with columns expected in the template. Take a look at the examples for more information. Defaults to instruction . use_system_prompt bool DEPRECATED. To be removed in 1.5.0. Whether to use the system prompt in the generation. Defaults to True , which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored. Input columns - dynamic (determined by
columns attribute): By default, this will be set to instruction . The columns can point to either a str or a List[str] to be used in the template. Output columns - generation (
str ): The generated text. - model_name (
str ): The name of the model used to generate the text. Categories References - Jinja2 Template Designer Documentation
Examples: Generate text from an instruction: from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n )\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [{\"instruction\": \"your instruction\"}]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'your instruction',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'generation',\n# }\n# ]\n Use a custom template to generate text: from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Document:\n{{ document }}\n\nQuestion: {{ question }}\n\nPlease provide a clear and concise answer to the question based on the information in the document and your general knowledge:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n template=CUSTOM_TEMPLATE,\n columns=[\"document\", \"question\"],\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [\n {\n \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n# 'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n# }\n# ]\n Few shot learning with different system prompts: from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n{% for example in examples %}\nExample {{ loop.index }}:\nInstruction: {{ example }}\n\n{% endfor %}\nNow, generate a new instruction in a similar style:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n template=CUSTOM_TEMPLATE,\n columns=\"examples\",\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [\n {\n \"examples\": [\"This is an example\", \"Another relevant example\"],\n \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'examples': ['This is an example', 'Another relevant example'],\n# 'system_prompt': 'You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'Disable the firewall on the router',\n# }\n# ]\n Source code in src/distilabel/steps/tasks/text_generation.py class TextGeneration(Task):\n \"\"\"Text generation with an `LLM` given a prompt.\n\n `TextGeneration` is a pre-defined task that allows passing a custom prompt using the\n Jinja2 syntax. By default, a `instruction` is expected in the inputs, but the using\n `template` and `columns` attributes one can define a custom prompt and columns expected\n from the text. This task should be good enough for tasks that don't need post-processing\n of the responses generated by the LLM.\n\n Attributes:\n system_prompt: The system prompt to use in the generation. If not provided, then\n it will check if the input row has a column named `system_prompt` and use it.\n If not, then no system prompt will be used. Defaults to `None`.\n template: The template to use for the generation. It must follow the Jinja2 template\n syntax. If not provided, it will assume the text passed is an instruction and\n construct the appropriate template.\n columns: A string with the column, or a list with columns expected in the template.\n Take a look at the examples for more information. Defaults to `instruction`.\n use_system_prompt: DEPRECATED. To be removed in 1.5.0. Whether to use the system\n prompt in the generation. 
Defaults to `True`, which means that if the column\n `system_prompt` is defined within the input batch, then the `system_prompt`\n will be used, otherwise, it will be ignored.\n\n Input columns:\n - dynamic (determined by `columns` attribute): By default will be set to `instruction`.\n The columns can point both to a `str` or a `List[str]` to be used in the template.\n\n Output columns:\n - generation (`str`): The generated text.\n - model_name (`str`): The name of the model used to generate the text.\n\n Categories:\n - text-generation\n\n References:\n - [Jinja2 Template Designer Documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/)\n\n Examples:\n Generate text from an instruction:\n\n ```python\n from distilabel.steps.tasks import TextGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n text_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n )\n )\n\n text_gen.load()\n\n result = next(\n text_gen.process(\n [{\"instruction\": \"your instruction\"}]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'your instruction',\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n # 'generation': 'generation',\n # }\n # ]\n ```\n\n Use a custom template to generate text:\n\n ```python\n from distilabel.steps.tasks import TextGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n CUSTOM_TEMPLATE = '''Document:\n {{ document }}\n\n Question: {{ question }}\n\n Please provide a clear and concise answer to the question based on the information in the document and your general knowledge:\n '''.rstrip()\n\n text_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n template=CUSTOM_TEMPLATE,\n columns=[\"document\", \"question\"],\n )\n\n text_gen.load()\n\n result = next(\n text_gen.process(\n [\n {\n \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n }\n ]\n )\n )\n # result\n # [\n # {\n # 'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n # 'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n # 'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n # }\n # ]\n ```\n\n Few shot learning with different system prompts:\n\n ```python\n from distilabel.steps.tasks import TextGeneration\n from distilabel.models import InferenceEndpointsLLM\n\n CUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n {% for example in examples %}\n Example {{ loop.index }}:\n Instruction: {{ example }}\n\n {% endfor %}\n Now, generate a new instruction in a similar style:\n '''.rstrip()\n\n text_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n template=CUSTOM_TEMPLATE,\n columns=\"examples\",\n )\n\n text_gen.load()\n\n result = next(\n text_gen.process(\n [\n {\n \"examples\": [\"This is an example\", \"Another relevant example\"],\n \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n }\n ]\n )\n )\n # result\n # [\n # {\n # 'examples': ['This is an example', 'Another relevant example'],\n # 'system_prompt': 'You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n # 'generation': 'Disable the firewall on the router',\n # }\n # ]\n ```\n \"\"\"\n\n system_prompt: Union[str, None] = None\n use_system_prompt: bool = Field(default=True, deprecated=True)\n template: str = Field(\n default=\"{{ instruction }}\",\n description=(\n \"This is a template or prompt to use for the generation. \"\n \"If not provided, it is assumed a `instruction` is placed in the inputs, \"\n \"to be used as is.\"\n ),\n )\n columns: Union[str, List[str]] = Field(\n default=\"instruction\",\n description=(\n \"Custom column or list of columns to include in the input. \"\n \"If a `template` is provided which needs custom column names, \"\n \"then they should be provided here. 
By default it will use `instruction`.\"\n ),\n )\n\n _can_be_used_with_offline_batch_generation = True\n _template: Optional[\"Template\"] = PrivateAttr(default=...)\n\n def model_post_init(self, __context: Any) -> None:\n self.columns = [self.columns] if isinstance(self.columns, str) else self.columns\n super().model_post_init(__context)\n\n def load(self) -> None:\n super().load()\n\n for column in self.columns:\n check_column_in_template(column, self.template)\n\n self._template = Template(self.template)\n\n def unload(self) -> None:\n super().unload()\n self._template = None\n\n @property\n def inputs(self) -> \"StepColumns\":\n \"\"\"The input for the task is the `instruction` by default, or the `columns` given as input.\"\"\"\n columns = {column: True for column in self.columns}\n columns[\"system_prompt\"] = False\n return columns\n\n def _prepare_message_content(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n fields = {column: input[column] for column in self.columns}\n return [{\"role\": \"user\", \"content\": self._template.render(**fields)}]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n # Handle the previous expected errors, in case of custom columns there's more freedom\n # and we cannot check it so easily.\n if self.columns == [\"instruction\"]:\n if is_openai_format(input[\"instruction\"]):\n raise DistilabelUserError(\n \"Providing `instruction` formatted as an OpenAI chat / conversation is\"\n \" deprecated, you should use `ChatGeneration` with `messages` as input instead.\",\n page=\"components-gallery/tasks/textgeneration/\",\n )\n\n if not isinstance(input[\"instruction\"], str):\n raise DistilabelUserError(\n f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n page=\"components-gallery/tasks/textgeneration/\",\n )\n\n messages = self._prepare_message_content(input)\n\n row_system_prompt = input.get(\"system_prompt\")\n if row_system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": row_system_prompt})\n\n if self.system_prompt and not row_system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n return messages # type: ignore\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n return [\"generation\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.inputs","title":"inputs: StepColumns property ","text":"The input for the task is the instruction by default, or the columns given as input. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.outputs","title":"outputs: List[str] property ","text":"The output for the task is the generation and the model_name . "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration._prepare_message_content","title":"_prepare_message_content(input) ","text":"Prepares the content for the template and returns the formatted messages. 
Source code in src/distilabel/steps/tasks/text_generation.py def _prepare_message_content(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n fields = {column: input[column] for column in self.columns}\n return [{\"role\": \"user\", \"content\": self._template.render(**fields)}]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/text_generation.py def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n # Handle the previous expected errors, in case of custom columns there's more freedom\n # and we cannot check it so easily.\n if self.columns == [\"instruction\"]:\n if is_openai_format(input[\"instruction\"]):\n raise DistilabelUserError(\n \"Providing `instruction` formatted as an OpenAI chat / conversation is\"\n \" deprecated, you should use `ChatGeneration` with `messages` as input instead.\",\n page=\"components-gallery/tasks/textgeneration/\",\n )\n\n if not isinstance(input[\"instruction\"], str):\n raise DistilabelUserError(\n f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n page=\"components-gallery/tasks/textgeneration/\",\n )\n\n messages = self._prepare_message_content(input)\n\n row_system_prompt = input.get(\"system_prompt\")\n if row_system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": row_system_prompt})\n\n if self.system_prompt and not row_system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n return messages # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.format_output","title":"format_output(output, input=None) ","text":"The output is formatted as a dictionary with the generation . The model_name will be automatically included within the process method of Task . Source code in src/distilabel/steps/tasks/text_generation.py def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n will be automatically included within the `process` method of `Task`.\"\"\"\n return {\"generation\": output}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGenerationWithImage","title":"TextGenerationWithImage ","text":" Bases: TextGeneration Text generation with images with an LLM given a prompt. `TextGenerationWithImage` is a pre-defined task that allows passing a custom prompt using the\nJinja2 syntax. By default, a `instruction` is expected in the inputs, but the using\n`template` and `columns` attributes one can define a custom prompt and columns expected\nfrom the text. Additionally, an `image` column is expected containing one of the\nurl, base64 encoded image or PIL image. This task inherits from `TextGeneration`,\nso all the functionality available in that task related to the prompt will be available\nhere too.\n\nAttributes:\n system_prompt: The system prompt to use in the generation.\n If not, then no system prompt will be used. Defaults to `None`.\n template: The template to use for the generation. 
It must follow the Jinja2 template\n syntax. If not provided, it will assume the text passed is an instruction and\n construct the appropriate template.\n columns: A string with the column, or a list with columns expected in the template.\n Take a look at the examples for more information. Defaults to `instruction`.\n image_type: The type of the image provided, this will be used to preprocess if necessary.\n Must be one of \"url\", \"base64\" or \"PIL\".\n\nInput columns:\n - dynamic (determined by `columns` attribute): By default will be set to `instruction`.\n The columns can point both to a `str` or a `list[str]` to be used in the template.\n - image: The column containing the image URL, base64 encoded image or PIL image.\n\nOutput columns:\n - generation (`str`): The generated text.\n - model_name (`str`): The name of the model used to generate the text.\n\nCategories:\n - text-generation\n\nReferences:\n - [Jinja2 Template Designer Documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/)\n - [Image-Text-to-Text](https://huggingface.co/tasks/image-text-to-text)\n - [OpenAI Vision](https://platform.openai.com/docs/guides/vision)\n\nExamples:\n Answer questions from an image:\n\n ```python\n from distilabel.steps.tasks import TextGenerationWithImage\n from distilabel.models.llms import InferenceEndpointsLLM\n\n vision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"url\"\n )\n\n vision.load()\n\n result = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n }\n ]\n )\n )\n # result\n # [\n # {\n # \"instruction\": \"What\u2019s in this image?\",\n # \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n # \"generation\": \"Based on the visual cues in the image...\",\n # \"model_name\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\"\n # ... # distilabel_metadata would be here\n # }\n # ]\n # result[0][\"generation\"]\n # \"Based on the visual cues in the image, here are some possible story points:\n - The image features a wooden boardwalk leading through a lush grass field, possibly in a park or nature reserve.
Analysis and Ideas: * The abundance of green grass and trees suggests a healthy ecosystem or habitat. * The presence of wildlife, such as birds or deer, is possible based on the surroundings. * A footbridge or a pathway might be a common feature in this area, providing access to nearby attractions or points of interest. Additional Questions to Ask: * Why is a footbridge present in this area? * What kind of wildlife inhabits this region\" Answer questions from an image stored as base64:\n\n```python\n# For this example we will assume that we have the string representation of the image\n# stored, but will just take the image and transform it to base64 to ilustrate the example.\nimport requests\nimport base64\n\nimage_url =\"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\nimg = requests.get(image_url).content\nbase64_image = base64.b64encode(img).decode(\"utf-8\")\n\nfrom distilabel.steps.tasks import TextGenerationWithImage\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nvision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"base64\"\n)\n\nvision.load()\n\nresult = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": base64_image\n }\n ]\n )\n)\n Source code in src/distilabel/steps/tasks/text_generation_with_image.py class TextGenerationWithImage(TextGeneration):\n \"\"\"Text generation with images with an `LLM` given a prompt.\n\n `TextGenerationWithImage` is a pre-defined task that allows passing a custom prompt using the\n Jinja2 syntax. By default, a `instruction` is expected in the inputs, but the using\n `template` and `columns` attributes one can define a custom prompt and columns expected\n from the text. Additionally, an `image` column is expected containing one of the\n url, base64 encoded image or PIL image. This task inherits from `TextGeneration`,\n so all the functionality available in that task related to the prompt will be available\n here too.\n\n Attributes:\n system_prompt: The system prompt to use in the generation.\n If not, then no system prompt will be used. Defaults to `None`.\n template: The template to use for the generation. It must follow the Jinja2 template\n syntax. If not provided, it will assume the text passed is an instruction and\n construct the appropriate template.\n columns: A string with the column, or a list with columns expected in the template.\n Take a look at the examples for more information. 
Defaults to `instruction`.\n image_type: The type of the image provided, this will be used to preprocess if necessary.\n Must be one of \"url\", \"base64\" or \"PIL\".\n\n Input columns:\n - dynamic (determined by `columns` attribute): By default will be set to `instruction`.\n The columns can point both to a `str` or a `list[str]` to be used in the template.\n - image: The column containing the image URL, base64 encoded image or PIL image.\n\n Output columns:\n - generation (`str`): The generated text.\n - model_name (`str`): The name of the model used to generate the text.\n\n Categories:\n - text-generation\n\n References:\n - [Jinja2 Template Designer Documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/)\n - [Image-Text-to-Text](https://huggingface.co/tasks/image-text-to-text)\n - [OpenAI Vision](https://platform.openai.com/docs/guides/vision)\n\n Examples:\n Answer questions from an image:\n\n ```python\n from distilabel.steps.tasks import TextGenerationWithImage\n from distilabel.models.llms import InferenceEndpointsLLM\n\n vision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"url\"\n )\n\n vision.load()\n\n result = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n }\n ]\n )\n )\n # result\n # [\n # {\n # \"instruction\": \"What\\u2019s in this image?\",\n # \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n # \"generation\": \"Based on the visual cues in the image...\",\n # \"model_name\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\"\n # ... 
# distilabel_metadata would be here\n # }\n # ]\n # result[0][\"generation\"]\n # \"Based on the visual cues in the image, here are some possible story points:\\n\\n* The image features a wooden boardwalk leading through a lush grass field, possibly in a park or nature reserve.\\n\\nAnalysis and Ideas:\\n* The abundance of green grass and trees suggests a healthy ecosystem or habitat.\\n* The presence of wildlife, such as birds or deer, is possible based on the surroundings.\\n* A footbridge or a pathway might be a common feature in this area, providing access to nearby attractions or points of interest.\\n\\nAdditional Questions to Ask:\\n* Why is a footbridge present in this area?\\n* What kind of wildlife inhabits this region\"\n ```\n\n Answer questions from an image stored as base64:\n\n ```python\n # For this example we will assume that we have the string representation of the image\n # stored, but will just take the image and transform it to base64 to ilustrate the example.\n import requests\n import base64\n\n image_url =\"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\n img = requests.get(image_url).content\n base64_image = base64.b64encode(img).decode(\"utf-8\")\n\n from distilabel.steps.tasks import TextGenerationWithImage\n from distilabel.models.llms import InferenceEndpointsLLM\n\n vision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"base64\"\n )\n\n vision.load()\n\n result = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": base64_image\n }\n ]\n )\n )\n ```\n \"\"\"\n\n image_type: Literal[\"url\", \"base64\", \"PIL\"] = Field(\n default=\"url\",\n description=\"The type of the image provided, this will be used to preprocess if necessary.\",\n )\n\n @property\n def inputs(self) -> \"StepColumns\":\n columns = super().inputs\n columns[\"image\"] = True\n return columns\n\n def load(self) -> None:\n Task.load(self)\n\n for column in self.columns:\n check_column_in_template(\n column, self.template, page=\"components-gallery/tasks/visiongeneration/\"\n )\n\n self._template = Template(self.template)\n\n def _transform_image(self, image: Union[str, \"Image\"]) -> str:\n \"\"\"Transforms the image based on the `image_type` attribute.\"\"\"\n if self.image_type == \"url\":\n return image\n\n if self.image_type == \"base64\":\n return f\"data:image/jpeg;base64,{image}\"\n\n # Othwerwise, it's a PIL image\n return f\"data:image/jpeg;base64,{image_to_str(image)}\"\n\n def _prepare_message_content(self, input: dict[str, Any]) -> \"ChatType\":\n \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n fields = {column: input[column] for column in self.columns}\n img_url = self._transform_image(input[\"image\"])\n return [\n {\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"text\",\n \"text\": self._template.render(**fields),\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": img_url,\n },\n },\n ],\n }\n ]\n\n def format_input(self, input: dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n messages = self._prepare_message_content(input)\n\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n return messages # type: ignore\n 
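The documented examples above cover the "url" and "base64" image types; for completeness, a minimal sketch of the "PIL" variant is shown below. This is not taken from the official examples: the Pillow/requests loading code, the reused image URL and the model id are illustrative placeholders, and the task is assumed to base64-encode the PIL image internally (as `_transform_image` suggests).

```python
# Hedged sketch (not an official example): passing a PIL image with image_type="PIL".
# Pillow and requests are assumed to be installed; URL and model id are placeholders
# reused from the documented examples.
from io import BytesIO

import requests
from PIL import Image

from distilabel.steps.tasks import TextGenerationWithImage
from distilabel.models.llms import InferenceEndpointsLLM

image_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
# Load the image as a PIL.Image object.
pil_image = Image.open(BytesIO(requests.get(image_url).content))

vision = TextGenerationWithImage(
    name="vision_gen",
    llm=InferenceEndpointsLLM(
        model_id="meta-llama/Llama-3.2-11B-Vision-Instruct",
    ),
    image_type="PIL",  # the task converts the PIL image to a base64 data URL internally
)

vision.load()

result = next(
    vision.process(
        [
            {
                "instruction": "What's in this image?",
                "image": pil_image,
            }
        ]
    )
)
```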
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGenerationWithImage._transform_image","title":"_transform_image(image) ","text":"Transforms the image based on the image_type attribute. Source code in src/distilabel/steps/tasks/text_generation_with_image.py def _transform_image(self, image: Union[str, \"Image\"]) -> str:\n \"\"\"Transforms the image based on the `image_type` attribute.\"\"\"\n if self.image_type == \"url\":\n return image\n\n if self.image_type == \"base64\":\n return f\"data:image/jpeg;base64,{image}\"\n\n # Othwerwise, it's a PIL image\n return f\"data:image/jpeg;base64,{image_to_str(image)}\"\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGenerationWithImage._prepare_message_content","title":"_prepare_message_content(input) ","text":"Prepares the content for the template and returns the formatted messages. Source code in src/distilabel/steps/tasks/text_generation_with_image.py def _prepare_message_content(self, input: dict[str, Any]) -> \"ChatType\":\n \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n fields = {column: input[column] for column in self.columns}\n img_url = self._transform_image(input[\"image\"])\n return [\n {\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"text\",\n \"text\": self._template.render(**fields),\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": img_url,\n },\n },\n ],\n }\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGenerationWithImage.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/text_generation_with_image.py def format_input(self, input: dict[str, Any]) -> \"ChatType\":\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n messages = self._prepare_message_content(input)\n\n if self.system_prompt:\n messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n return messages # type: ignore\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback","title":"UltraFeedback ","text":" Bases: Task Rank generations focusing on different aspects using an LLM . UltraFeedback: Boosting Language Models with High-quality Feedback. Attributes: Name Type Description aspect Literal['helpfulness', 'honesty', 'instruction-following', 'truthfulness', 'overall-rating'] The aspect to perform with the UltraFeedback model. The available aspects are: - helpfulness : Evaluate text outputs based on helpfulness. - honesty : Evaluate text outputs based on honesty. - instruction-following : Evaluate text outputs based on given instructions. - truthfulness : Evaluate text outputs based on truthfulness. Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall assessment of the text outputs within a single prompt. The custom aspect is: - overall-rating : Evaluate text outputs based on an overall assessment. Defaults to \"overall-rating\" . Input columns - instruction (
str ): The reference instruction to evaluate the text outputs. - generations (
List[str] ): The text outputs to evaluate for the given instruction. Output columns - ratings (
List[float] ): The ratings for each of the provided text outputs. - rationales (
List[str] ): The rationales for each of the provided text outputs. - model_name (
str ): The name of the model used to generate the ratings and rationales. Categories References UltraFeedback: Boosting Language Models with High-quality Feedback UltraFeedback - GitHub Repository Examples: Rate generations from different LLMs based on the selected aspect: from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=False\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [1, 2],\n# 'rationales': ['explanation for 4', 'explanation for and a car'],\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# }\n# ]\n Rate generations from different LLMs based on the honesty, using the default structured output: from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n aspect=\"honesty\"\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [5, 1],\n# 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',\n# \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\n 5,\\n 1\\n] \\n\\n,\"rationales\": [\\n \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\n \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"\\n] }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Rate generations from different LLMs based on the helpfulness, using the default structured output: from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512},\n ),\n aspect=\"helpfulness\"\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [1, 5],\n# 'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. 
However, it lacks comprehensive information or detailed description.',\n# 'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n# 'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n# 'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n# 'types': [1, 3, 1],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\n \"ratings\": [\\n 1,\\n 5\\n ]\\n ,\\n \"rationales\": [\\n \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\n \"Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.\"\\n ]\\n ,\\n \"rationales_for_rating\": [\\n \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\n \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\n ]\\n ,\\n \"types\": [\\n 1, 3,\\n 1\\n ]\\n }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n Citations @misc{cui2024ultrafeedbackboostinglanguagemodels,\n title={UltraFeedback: Boosting Language Models with Scaled AI Feedback},\n author={Ganqu Cui and Lifan Yuan and Ning Ding and Guanming Yao and Bingxiang He and Wei Zhu and Yuan Ni and Guotong Xie and Ruobing Xie and Yankai Lin and Zhiyuan Liu and Maosong Sun},\n year={2024},\n eprint={2310.01377},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2310.01377},\n}\n Source code in src/distilabel/steps/tasks/ultrafeedback.py class UltraFeedback(Task):\n \"\"\"Rank generations focusing on different aspects using an `LLM`.\n\n UltraFeedback: Boosting Language Models with High-quality Feedback.\n\n Attributes:\n aspect: The aspect to perform with the `UltraFeedback` model. The available aspects are:\n - `helpfulness`: Evaluate text outputs based on helpfulness.\n - `honesty`: Evaluate text outputs based on honesty.\n - `instruction-following`: Evaluate text outputs based on given instructions.\n - `truthfulness`: Evaluate text outputs based on truthfulness.\n Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall\n assessment of the text outputs within a single prompt. 
The custom aspect is:\n - `overall-rating`: Evaluate text outputs based on an overall assessment.\n Defaults to `\"overall-rating\"`.\n\n Input columns:\n - instruction (`str`): The reference instruction to evaluate the text outputs.\n - generations (`List[str]`): The text outputs to evaluate for the given instruction.\n\n Output columns:\n - ratings (`List[float]`): The ratings for each of the provided text outputs.\n - rationales (`List[str]`): The rationales for each of the provided text outputs.\n - model_name (`str`): The name of the model used to generate the ratings and rationales.\n\n Categories:\n - preference\n\n References:\n - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377)\n - [`UltraFeedback - GitHub Repository`](https://github.com/OpenBMB/UltraFeedback)\n\n Examples:\n Rate generations from different LLMs based on the selected aspect:\n\n ```python\n from distilabel.steps.tasks import UltraFeedback\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n ultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=False\n )\n\n ultrafeedback.load()\n\n result = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n )\n # result\n # [\n # {\n # 'instruction': 'How much is 2+2?',\n # 'generations': ['4', 'and a car'],\n # 'ratings': [1, 2],\n # 'rationales': ['explanation for 4', 'explanation for and a car'],\n # 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n # }\n # ]\n ```\n\n Rate generations from different LLMs based on the honesty, using the default structured output:\n\n ```python\n from distilabel.steps.tasks import UltraFeedback\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n ultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n aspect=\"honesty\"\n )\n\n ultrafeedback.load()\n\n result = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n )\n # result\n # [{'instruction': 'How much is 2+2?',\n # 'generations': ['4', 'and a car'],\n # 'ratings': [5, 1],\n # 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',\n # \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n # 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\\\n 5,\\\\n 1\\\\n] \\\\n\\\\n,\"rationales\": [\\\\n \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\\\n \"The response is confidently incorrect, as it provides unrelated information (\\'a car\\') and does not address the question. 
The model shows no uncertainty or indication that it does not know the answer.\"\\\\n] }'},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Rate generations from different LLMs based on the helpfulness, using the default structured output:\n\n ```python\n from distilabel.steps.tasks import UltraFeedback\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n ultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512},\n ),\n aspect=\"helpfulness\"\n )\n\n ultrafeedback.load()\n\n result = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n )\n # result\n # [{'instruction': 'How much is 2+2?',\n # 'generations': ['4', 'and a car'],\n # 'ratings': [1, 5],\n # 'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',\n # 'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n # 'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n # 'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n # 'types': [1, 3, 1],\n # 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\\\n \"ratings\": [\\\\n 1,\\\\n 5\\\\n ]\\\\n ,\\\\n \"rationales\": [\\\\n \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\\\n \"Text 2 is neither clear nor relevant to the task. 
It does not provide any useful information and seems unrelated to the question.\"\\\\n ]\\\\n ,\\\\n \"rationales_for_rating\": [\\\\n \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\\\n \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\\\n ]\\\\n ,\\\\n \"types\": [\\\\n 1, 3,\\\\n 1\\\\n ]\\\\n }'},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n ```\n\n Citations:\n ```\n @misc{cui2024ultrafeedbackboostinglanguagemodels,\n title={UltraFeedback: Boosting Language Models with Scaled AI Feedback},\n author={Ganqu Cui and Lifan Yuan and Ning Ding and Guanming Yao and Bingxiang He and Wei Zhu and Yuan Ni and Guotong Xie and Ruobing Xie and Yankai Lin and Zhiyuan Liu and Maosong Sun},\n year={2024},\n eprint={2310.01377},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2310.01377},\n }\n ```\n \"\"\"\n\n aspect: Literal[\n \"helpfulness\",\n \"honesty\",\n \"instruction-following\",\n \"truthfulness\",\n # Custom aspects\n \"overall-rating\",\n ] = \"overall-rating\"\n\n _system_prompt: str = PrivateAttr(\n default=(\n \"Your role is to evaluate text quality based on given criteria.\\n\"\n 'You\\'ll receive an instructional description (\"Instruction\") and {no_texts} text outputs (\"Text\").\\n'\n \"Understand and interpret instructions to evaluate effectively.\\n\"\n \"Provide annotations for each text with a rating and rationale.\\n\"\n \"The {no_texts} texts given are independent, and should be evaluated separately.\\n\"\n )\n )\n _template: Optional[\"Template\"] = PrivateAttr(default=...)\n _can_be_used_with_offline_batch_generation = True\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"ultrafeedback\"\n / f\"{self.aspect}.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`, and the `generations` for it.\"\"\"\n return [\"instruction\", \"generations\"]\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"system\",\n \"content\": self._system_prompt.format(\n no_texts=len(input[\"generations\"])\n ),\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], generations=input[\"generations\"]\n ),\n },\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n columns = []\n if self.aspect in [\"honesty\", \"instruction-following\", \"overall-rating\"]:\n columns = [\"ratings\", \"rationales\"]\n elif self.aspect in [\"helpfulness\", \"truthfulness\"]:\n columns = [\"types\", \"rationales\", \"ratings\", \"rationales-for-ratings\"]\n return columns + [\"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `ratings` and `rationales` for\n each of the provided `generations` for the given `instruction`. 
The `model_name`\n will be automatically included within the `process` method of `Task`.\n\n Args:\n output: a string representing the output of the LLM via the `process` method.\n input: the input to the task, as required by some tasks to format the output.\n\n Returns:\n A dictionary containing either the `ratings` and `rationales` for each of the provided\n `generations` for the given `instruction` if the provided aspect is either `honesty`,\n `instruction-following`, or `overall-rating`; or the `types`, `rationales`,\n `ratings`, and `rationales-for-ratings` for each of the provided `generations` for the\n given `instruction` if the provided aspect is either `helpfulness` or `truthfulness`.\n \"\"\"\n assert input is not None, \"Input is required to format the output.\"\n\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return self._format_ratings_rationales_output(output, input)\n\n return self._format_types_ratings_rationales_output(output, input)\n\n def _format_ratings_rationales_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, List[Any]]:\n \"\"\"Formats the output when the aspect is either `honesty`, `instruction-following`, or `overall-rating`.\"\"\"\n if output is None:\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n }\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n pattern = r\"Rating: (.+?)\\nRationale: (.+)\"\n sections = output.split(\"\\n\\n\")\n\n formatted_outputs = []\n for section in sections:\n matches = None\n if section is not None and section != \"\":\n matches = re.search(pattern, section, re.DOTALL)\n if not matches:\n formatted_outputs.append({\"ratings\": None, \"rationales\": None})\n continue\n\n formatted_outputs.append(\n {\n \"ratings\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n if matches.group(1) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales\": matches.group(2),\n }\n )\n return group_dicts(*formatted_outputs)\n\n def _format_types_ratings_rationales_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, List[Any]]:\n \"\"\"Formats the output when the aspect is either `helpfulness` or `truthfulness`.\"\"\"\n if output is None:\n return {\n \"types\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n }\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n pattern = r\"Type: (.+?)\\nRationale: (.+?)\\nRating: (.+?)\\nRationale: (.+)\"\n\n sections = output.split(\"\\n\\n\")\n\n formatted_outputs = []\n for section in sections:\n matches = None\n if section is not None and section != \"\":\n matches = re.search(pattern, section, re.DOTALL)\n if not matches:\n formatted_outputs.append(\n {\n \"types\": None,\n \"rationales\": None,\n \"ratings\": None,\n \"rationales-for-ratings\": None,\n }\n )\n continue\n\n formatted_outputs.append(\n {\n \"types\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n if matches.group(1) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales\": matches.group(2),\n \"ratings\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(3))[0])\n if matches.group(3) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales-for-ratings\": 
matches.group(4),\n }\n )\n return group_dicts(*formatted_outputs)\n\n @override\n def get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaUltraFeedback(BaseModel):\n ratings: List[int]\n rationales: List[str]\n\n class SchemaUltraFeedbackWithType(BaseModel):\n types: List[Optional[int]]\n ratings: List[int]\n rationales: List[str]\n rationales_for_rating: List[str]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return {\n \"properties\": {\n \"ratings\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Ratings\",\n \"type\": \"array\",\n },\n \"rationales\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"ratings\", \"rationales\"],\n \"title\": \"SchemaUltraFeedback\",\n \"type\": \"object\",\n }\n return {\n \"properties\": {\n \"types\": {\n \"items\": {\"anyOf\": [{\"type\": \"integer\"}, {\"type\": \"null\"}]},\n \"title\": \"Types\",\n \"type\": \"array\",\n },\n \"ratings\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Ratings\",\n \"type\": \"array\",\n },\n \"rationales\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales\",\n \"type\": \"array\",\n },\n \"rationales_for_rating\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales For Rating\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"types\", \"ratings\", \"rationales\", \"rationales_for_rating\"],\n \"title\": \"SchemaUltraFeedbackWithType\",\n \"type\": \"object\",\n }\n\n def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n ) -> Dict[str, Any]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n }\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n \"types\": [None] * len(input[\"generations\"]),\n \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n }\n\n @override\n def _sample_input(self) -> ChatType:\n return self.format_input(\n {\n \"instruction\": f\"<PLACEHOLDER_{'instruction'.upper()}>\",\n \"generations\": [\n f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n ],\n }\n )\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.inputs","title":"inputs: List[str] property ","text":"The input for the task is the instruction , and the generations for it. "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.outputs","title":"outputs: List[str] property ","text":"The output for the task is the generation and the model_name . 
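Note that, in practice, the concrete columns returned by `outputs` depend on the configured `aspect`, as can be seen in the property's source above. A minimal sketch of that logic (illustrative only, not part of the original docs):

```python
# Minimal sketch mirroring the `outputs` property logic shown above (illustrative only).
def expected_output_columns(aspect: str) -> list:
    if aspect in ["honesty", "instruction-following", "overall-rating"]:
        columns = ["ratings", "rationales"]
    else:  # "helpfulness" or "truthfulness"
        columns = ["types", "rationales", "ratings", "rationales-for-ratings"]
    return columns + ["model_name"]

print(expected_output_columns("overall-rating"))
# ['ratings', 'rationales', 'model_name']
print(expected_output_columns("helpfulness"))
# ['types', 'rationales', 'ratings', 'rationales-for-ratings', 'model_name']
```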
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.load","title":"load() ","text":"Loads the Jinja2 template for the given aspect . Source code in src/distilabel/steps/tasks/ultrafeedback.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"ultrafeedback\"\n / f\"{self.aspect}.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.format_input","title":"format_input(input) ","text":"The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Source code in src/distilabel/steps/tasks/ultrafeedback.py def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation.\"\"\"\n return [\n {\n \"role\": \"system\",\n \"content\": self._system_prompt.format(\n no_texts=len(input[\"generations\"])\n ),\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render( # type: ignore\n instruction=input[\"instruction\"], generations=input[\"generations\"]\n ),\n },\n ]\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.format_output","title":"format_output(output, input=None) ","text":"The output is formatted as a dictionary with the ratings and rationales for each of the provided generations for the given instruction . The model_name will be automatically included within the process method of Task . Parameters: Name Type Description Default output Union[str, None] a string representing the output of the LLM via the process method. required input Union[Dict[str, Any], None] the input to the task, as required by some tasks to format the output. None Returns: Type Description Dict[str, Any] A dictionary containing either the ratings and rationales for each of the provided Dict[str, Any] generations for the given instruction if the provided aspect is either honesty , Dict[str, Any] instruction-following , or overall-rating ; or the types , rationales , Dict[str, Any] ratings , and rationales-for-ratings for each of the provided generations for the Dict[str, Any] given instruction if the provided aspect is either helpfulness or truthfulness . Source code in src/distilabel/steps/tasks/ultrafeedback.py def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n \"\"\"The output is formatted as a dictionary with the `ratings` and `rationales` for\n each of the provided `generations` for the given `instruction`. 
The `model_name`\n will be automatically included within the `process` method of `Task`.\n\n Args:\n output: a string representing the output of the LLM via the `process` method.\n input: the input to the task, as required by some tasks to format the output.\n\n Returns:\n A dictionary containing either the `ratings` and `rationales` for each of the provided\n `generations` for the given `instruction` if the provided aspect is either `honesty`,\n `instruction-following`, or `overall-rating`; or the `types`, `rationales`,\n `ratings`, and `rationales-for-ratings` for each of the provided `generations` for the\n given `instruction` if the provided aspect is either `helpfulness` or `truthfulness`.\n \"\"\"\n assert input is not None, \"Input is required to format the output.\"\n\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return self._format_ratings_rationales_output(output, input)\n\n return self._format_types_ratings_rationales_output(output, input)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_ratings_rationales_output","title":"_format_ratings_rationales_output(output, input) ","text":"Formats the output when the aspect is either honesty , instruction-following , or overall-rating . Source code in src/distilabel/steps/tasks/ultrafeedback.py def _format_ratings_rationales_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, List[Any]]:\n \"\"\"Formats the output when the aspect is either `honesty`, `instruction-following`, or `overall-rating`.\"\"\"\n if output is None:\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n }\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n pattern = r\"Rating: (.+?)\\nRationale: (.+)\"\n sections = output.split(\"\\n\\n\")\n\n formatted_outputs = []\n for section in sections:\n matches = None\n if section is not None and section != \"\":\n matches = re.search(pattern, section, re.DOTALL)\n if not matches:\n formatted_outputs.append({\"ratings\": None, \"rationales\": None})\n continue\n\n formatted_outputs.append(\n {\n \"ratings\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n if matches.group(1) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales\": matches.group(2),\n }\n )\n return group_dicts(*formatted_outputs)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_types_ratings_rationales_output","title":"_format_types_ratings_rationales_output(output, input) ","text":"Formats the output when the aspect is either helpfulness or truthfulness . 
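To make the parsing performed below concrete, here is a small, hypothetical raw output written in the `Type`/`Rationale`/`Rating`/`Rationale` layout that the regex expects, together with the groups it extracts (a sketch, not an official example):

```python
import re

# Hypothetical raw LLM output for two generations, in the layout the regex expects.
raw_output = (
    "Type: 1\nRationale: correct and relevant.\nRating: 5\nRationale: fully helpful.\n\n"
    "Type: 3\nRationale: unrelated to the question.\nRating: 1\nRationale: severely incorrect."
)

pattern = r"Type: (.+?)\nRationale: (.+?)\nRating: (.+?)\nRationale: (.+)"
for section in raw_output.split("\n\n"):
    match = re.search(pattern, section, re.DOTALL)
    if match:
        print(match.groups())
# ('1', 'correct and relevant.', '5', 'fully helpful.')
# ('3', 'unrelated to the question.', '1', 'severely incorrect.')
```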
Source code in src/distilabel/steps/tasks/ultrafeedback.py def _format_types_ratings_rationales_output(\n self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, List[Any]]:\n \"\"\"Formats the output when the aspect is either `helpfulness` or `truthfulness`.\"\"\"\n if output is None:\n return {\n \"types\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n }\n\n if self.use_default_structured_output:\n return self._format_structured_output(output, input)\n\n pattern = r\"Type: (.+?)\\nRationale: (.+?)\\nRating: (.+?)\\nRationale: (.+)\"\n\n sections = output.split(\"\\n\\n\")\n\n formatted_outputs = []\n for section in sections:\n matches = None\n if section is not None and section != \"\":\n matches = re.search(pattern, section, re.DOTALL)\n if not matches:\n formatted_outputs.append(\n {\n \"types\": None,\n \"rationales\": None,\n \"ratings\": None,\n \"rationales-for-ratings\": None,\n }\n )\n continue\n\n formatted_outputs.append(\n {\n \"types\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n if matches.group(1) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales\": matches.group(2),\n \"ratings\": (\n int(re.findall(r\"\\b\\d+\\b\", matches.group(3))[0])\n if matches.group(3) not in [\"None\", \"N/A\"]\n else None\n ),\n \"rationales-for-ratings\": matches.group(4),\n }\n )\n return group_dicts(*formatted_outputs)\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.get_structured_output","title":"get_structured_output() ","text":"Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary. The schema corresponds to the following: from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaUltraFeedback(BaseModel):\n ratings: List[int]\n rationales: List[str]\n\nclass SchemaUltraFeedbackWithType(BaseModel):\n types: List[Optional[int]]\n ratings: List[int]\n rationales: List[str]\n rationales_for_rating: List[str]\n Returns: Type Description Dict[str, Any] JSON Schema of the response to enforce. 
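For reference, a roughly equivalent schema can be generated from the pydantic models listed above; this is only a sketch and assumes pydantic v2 (which provides `model_json_schema`) is installed:

```python
from typing import List, Optional

from pydantic import BaseModel


class SchemaUltraFeedback(BaseModel):
    ratings: List[int]
    rationales: List[str]


class SchemaUltraFeedbackWithType(BaseModel):
    types: List[Optional[int]]
    ratings: List[int]
    rationales: List[str]
    rationales_for_rating: List[str]


# Roughly equivalent to the dictionaries returned by `get_structured_output` below.
print(SchemaUltraFeedback.model_json_schema())
print(SchemaUltraFeedbackWithType.model_json_schema())
```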
Source code in src/distilabel/steps/tasks/ultrafeedback.py @override\ndef get_structured_output(self) -> Dict[str, Any]:\n \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n a dictionary with the output which can be directly parsed as a python dictionary.\n\n The schema corresponds to the following:\n\n ```python\n from pydantic import BaseModel\n from typing import List\n\n class SchemaUltraFeedback(BaseModel):\n ratings: List[int]\n rationales: List[str]\n\n class SchemaUltraFeedbackWithType(BaseModel):\n types: List[Optional[int]]\n ratings: List[int]\n rationales: List[str]\n rationales_for_rating: List[str]\n ```\n\n Returns:\n JSON Schema of the response to enforce.\n \"\"\"\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return {\n \"properties\": {\n \"ratings\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Ratings\",\n \"type\": \"array\",\n },\n \"rationales\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"ratings\", \"rationales\"],\n \"title\": \"SchemaUltraFeedback\",\n \"type\": \"object\",\n }\n return {\n \"properties\": {\n \"types\": {\n \"items\": {\"anyOf\": [{\"type\": \"integer\"}, {\"type\": \"null\"}]},\n \"title\": \"Types\",\n \"type\": \"array\",\n },\n \"ratings\": {\n \"items\": {\"type\": \"integer\"},\n \"title\": \"Ratings\",\n \"type\": \"array\",\n },\n \"rationales\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales\",\n \"type\": \"array\",\n },\n \"rationales_for_rating\": {\n \"items\": {\"type\": \"string\"},\n \"title\": \"Rationales For Rating\",\n \"type\": \"array\",\n },\n },\n \"required\": [\"types\", \"ratings\", \"rationales\", \"rationales_for_rating\"],\n \"title\": \"SchemaUltraFeedbackWithType\",\n \"type\": \"object\",\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_structured_output","title":"_format_structured_output(output, input) ","text":"Parses the structured response, which should correspond to a dictionary with either positive , or positive and negative keys. Parameters: Name Type Description Default output str The output from the LLM . required Returns: Type Description Dict[str, Any] Formatted output. Source code in src/distilabel/steps/tasks/ultrafeedback.py def _format_structured_output(\n self, output: str, input: Dict[str, Any]\n) -> Dict[str, Any]:\n \"\"\"Parses the structured response, which should correspond to a dictionary\n with either `positive`, or `positive` and `negative` keys.\n\n Args:\n output: The output from the `LLM`.\n\n Returns:\n Formatted output.\n \"\"\"\n try:\n return orjson.loads(output)\n except orjson.JSONDecodeError:\n if self.aspect in [\n \"honesty\",\n \"instruction-following\",\n \"overall-rating\",\n ]:\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n }\n return {\n \"ratings\": [None] * len(input[\"generations\"]),\n \"rationales\": [None] * len(input[\"generations\"]),\n \"types\": [None] * len(input[\"generations\"]),\n \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n }\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.URIAL","title":"URIAL ","text":" Bases: Task Generates a response using a non-instruct fine-tuned model. URIAL is a pre-defined task that generates a response using a non-instruct fine-tuned model. 
This task is used to generate a response based on the conversation provided as input. Input columns - instruction (
str , optional): The instruction to generate a response from. - conversation (
List[Dict[str, str]] , optional): The conversation to generate a response from (the last message must be from the user). Output columns - generation (
str ): The generated response. - model_name (
str ): The name of the model used to generate the response. Categories References - The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning
Examples: Generate text from an instruction: from distilabel.models import vLLM\nfrom distilabel.steps.tasks import URIAL\n\nstep = URIAL(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3.1-8B\",\n generation_kwargs={\"temperature\": 0.7},\n ),\n)\n\nstep.load()\n\nresults = next(\n step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n)\n# [\n# {\n# 'instruction': \"What's the most most common type of cloud?\",\n# 'generation': 'Clouds are classified into three main types, high, middle, and low. The most common type of cloud is the middle cloud.',\n# 'distilabel_metadata': {...},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n# }\n# ]\n Source code in src/distilabel/steps/tasks/urial.py class URIAL(Task):\n \"\"\"Generates a response using a non-instruct fine-tuned model.\n\n `URIAL` is a pre-defined task that generates a response using a non-instruct fine-tuned\n model. This task is used to generate a response based on the conversation provided as\n input.\n\n Input columns:\n - instruction (`str`, optional): The instruction to generate a response from.\n - conversation (`List[Dict[str, str]]`, optional): The conversation to generate\n a response from (the last message must be from the user).\n\n Output columns:\n - generation (`str`): The generated response.\n - model_name (`str`): The name of the model used to generate the response.\n\n Categories:\n - text-generation\n\n References:\n - [The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning](https://arxiv.org/abs/2312.01552)\n\n Examples:\n Generate text from an instruction:\n\n ```python\n from distilabel.models import vLLM\n from distilabel.steps.tasks import URIAL\n\n step = URIAL(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3.1-8B\",\n generation_kwargs={\"temperature\": 0.7},\n ),\n )\n\n step.load()\n\n results = next(\n step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n )\n # [\n # {\n # 'instruction': \"What's the most most common type of cloud?\",\n # 'generation': 'Clouds are classified into three main types, high, middle, and low. 
The most common type of cloud is the middle cloud.',\n # 'distilabel_metadata': {...},\n # 'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n # }\n # ]\n ```\n \"\"\"\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"urial.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n\n @property\n def inputs(self) -> \"StepColumns\":\n return {\"instruction\": False, \"conversation\": False}\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n messages = (\n [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n if \"instruction\" in input\n else input[\"conversation\"]\n )\n\n if messages[-1][\"role\"] != \"user\":\n raise ValueError(\"The last message must be from the user.\")\n\n return [{\"role\": \"user\", \"content\": self._template.render(messages=messages)}]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"generation\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n if output is None:\n return {\"generation\": None}\n\n response = output.split(\"\\n\\n# User\")[0]\n if response.startswith(\"\\n\\n\"):\n response = response[2:]\n response = response.strip()\n\n return {\"generation\": response}\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.URIAL.load","title":"load() ","text":"Loads the Jinja2 template for the given aspect . Source code in src/distilabel/steps/tasks/urial.py def load(self) -> None:\n \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n super().load()\n\n _path = str(\n importlib_resources.files(\"distilabel\")\n / \"steps\"\n / \"tasks\"\n / \"templates\"\n / \"urial.jinja2\"\n )\n\n self._template = Template(open(_path).read())\n "},{"location":"api/task/task_gallery/#distilabel.steps.tasks.task","title":"task(inputs=None, outputs=None) ","text":"Creates a Task from a formatting output function. Parameters: Name Type Description Default inputs Union[StepColumns, None] a list containing the name of the inputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not, that are required by the step. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None . None outputs Union[StepColumns, None] a list containing the name of the outputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column will be generated or not. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None . None Source code in src/distilabel/steps/tasks/decorator.py def task(\n inputs: Union[\"StepColumns\", None] = None,\n outputs: Union[\"StepColumns\", None] = None,\n) -> Callable[..., Type[\"Task\"]]:\n \"\"\"Creates a `Task` from a formatting output function.\n\n Args:\n inputs: a list containing the name of the inputs columns/keys or a dictionary\n where the keys are the columns and the values are booleans indicating whether\n the column is required or not, that are required by the step. If not provided\n the default will be an empty list `[]` and it will be assumed that the step\n doesn't need any specific columns. 
Defaults to `None`.\n outputs: a list containing the name of the outputs columns/keys or a dictionary\n where the keys are the columns and the values are booleans indicating whether\n the column will be generated or not. If not provided the default will be an\n empty list `[]` and it will be assumed that the step doesn't need any specific\n columns. Defaults to `None`.\n \"\"\"\n\n inputs = inputs or []\n outputs = outputs or []\n\n def decorator(func: TaskFormattingOutputFunc) -> Type[\"Task\"]:\n doc = inspect.getdoc(func)\n if doc is None:\n raise DistilabelUserError(\n \"When using the `task` decorator, including a docstring in the formatting\"\n \" function is mandatory. The docstring must follow the format described\"\n \" in the documentation.\",\n page=\"\",\n )\n\n system_prompt, user_message_template = _parse_docstring(doc)\n _validate_templates(inputs, system_prompt, user_message_template)\n\n def inputs_property(self) -> \"StepColumns\":\n return inputs\n\n def outputs_property(self) -> \"StepColumns\":\n return outputs\n\n def format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n return [\n {\"role\": \"system\", \"content\": system_prompt.format(**input)},\n {\"role\": \"user\", \"content\": user_message_template.format(**input)},\n ]\n\n def format_output(\n self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n ) -> Dict[str, Any]:\n return func(output, input)\n\n return type(\n func.__name__,\n (Task,),\n {\n \"inputs\": property(inputs_property),\n \"outputs\": property(outputs_property),\n \"__module__\": func.__module__,\n \"format_input\": format_input,\n \"format_output\": format_output,\n },\n )\n\n return decorator\n "},{"location":"api/task/typing/","title":"Task Typing","text":""},{"location":"api/task/typing/#distilabel.steps.tasks.typing","title":"typing ","text":""},{"location":"api/task/typing/#distilabel.steps.tasks.typing.ChatType","title":"ChatType = List[ChatItem] module-attribute ","text":"ChatType is a type alias for a list of dict s following the OpenAI conversational format. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StructuredOutputType","title":"StructuredOutputType = Union[OutlinesStructuredOutputType, InstructorStructuredOutputType] module-attribute ","text":"StructuredOutputType is an alias for the union of OutlinesStructuredOutputType and InstructorStructuredOutputType . "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StandardInput","title":"StandardInput = ChatType module-attribute ","text":"StandardInput is an alias for ChatType that defines the default / standard input produced by format_input . "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StructuredInput","title":"StructuredInput = Tuple[StandardInput, Union[StructuredOutputType, None]] module-attribute ","text":"StructuredInput defines a type produced by format_input when using either StructuredGeneration or a subclass of it. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.FormattedInput","title":"FormattedInput = Union[StandardInput, StructuredInput, ChatType] module-attribute ","text":"FormattedInput is an alias for the union of StandardInput and StructuredInput as generated by format_input and expected by the LLM s, as well as ConversationType for the vision language models. 
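As an illustrative aside (a minimal sketch, not taken verbatim from the codebase), a `StandardInput`/`ChatType` is simply a list of OpenAI-style message dictionaries, and a `StructuredInput` pairs such a list with an optional structured-output configuration:

```python
from typing import Any, Dict, List, Optional, Tuple

# A StandardInput / ChatType: a list of OpenAI-style chat messages.
standard_input: List[Dict[str, str]] = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "How much is 2+2?"},
]

# A StructuredInput: the same messages paired with an (optional) structured
# output configuration, e.g. an outlines-style {"format": "json", "schema": ...} dict.
structured_output_config: Optional[Dict[str, Any]] = {
    "format": "json",
    "schema": {"type": "object", "properties": {"answer": {"type": "integer"}}},
}
structured_input: Tuple[List[Dict[str, str]], Optional[Dict[str, Any]]] = (
    standard_input,
    structured_output_config,
)
```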
"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.ImageUrl","title":"ImageUrl ","text":" Bases: TypedDict Source code in src/distilabel/steps/tasks/typing.py class ImageUrl(TypedDict):\n url: Required[str]\n \"\"\"Either a URL of the image or the base64 encoded image data.\"\"\"\n "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.ImageUrl.url","title":"url: Required[str] instance-attribute ","text":"Either a URL of the image or the base64 encoded image data. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.ImageContent","title":"ImageContent ","text":" Bases: TypedDict Type alias for the user's message in a conversation that can include text or an image. It's the standard type for vision language models: https://platform.openai.com/docs/guides/vision Source code in src/distilabel/steps/tasks/typing.py class ImageContent(TypedDict, total=False):\n \"\"\"Type alias for the user's message in a conversation that can include text or an image.\n It's the standard type for vision language models:\n https://platform.openai.com/docs/guides/vision\n \"\"\"\n\n type: Required[Literal[\"image_url\"]]\n image_url: Required[ImageUrl]\n "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType","title":"OutlinesStructuredOutputType ","text":" Bases: TypedDict TypedDict to represent the structured output configuration from outlines . Source code in src/distilabel/steps/tasks/typing.py class OutlinesStructuredOutputType(TypedDict, total=False):\n \"\"\"TypedDict to represent the structured output configuration from `outlines`.\"\"\"\n\n format: Literal[\"json\", \"regex\"]\n \"\"\"One of \"json\" or \"regex\".\"\"\"\n schema: Union[str, Type[BaseModel], Dict[str, Any]]\n \"\"\"The schema to use for the structured output. If \"json\", it\n can be a pydantic.BaseModel class, or the schema as a string,\n as obtained from `model_to_schema(BaseModel)`, if \"regex\", it\n should be a regex pattern as a string.\n \"\"\"\n whitespace_pattern: Optional[Union[str, List[str]]]\n \"\"\"If \"json\" corresponds to a string or a list of\n strings with a pattern (doesn't impact string literals).\n For example, to allow only a single space or newline with\n `whitespace_pattern=r\"[\\n ]?\"`\n \"\"\"\n "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.format","title":"format: Literal['json', 'regex'] instance-attribute ","text":"One of \"json\" or \"regex\". "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.schema","title":"schema: Union[str, Type[BaseModel], Dict[str, Any]] instance-attribute ","text":"The schema to use for the structured output. If \"json\", it can be a pydantic.BaseModel class, or the schema as a string, as obtained from model_to_schema(BaseModel) , if \"regex\", it should be a regex pattern as a string. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.whitespace_pattern","title":"whitespace_pattern: Optional[Union[str, List[str]]] instance-attribute ","text":"If \"json\" corresponds to a string or a list of strings with a pattern (doesn't impact string literals). For example, to allow only a single space or newline with whitespace_pattern=r\"[ ]?\" "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType","title":"InstructorStructuredOutputType ","text":" Bases: TypedDict TypedDict to represent the structured output configuration from instructor . 
Source code in src/distilabel/steps/tasks/typing.py class InstructorStructuredOutputType(TypedDict, total=False):\n \"\"\"TypedDict to represent the structured output configuration from `instructor`.\"\"\"\n\n format: Optional[Literal[\"json\"]]\n \"\"\"One of \"json\".\"\"\"\n schema: Union[Type[BaseModel], Dict[str, Any]]\n \"\"\"The schema to use for the structured output, a `pydantic.BaseModel` class. \"\"\"\n mode: Optional[str]\n \"\"\"Generation mode. Take a look at `instructor.Mode` for more information, if not informed it will\n be determined automatically. \"\"\"\n max_retries: int\n \"\"\"Number of times to reask the model in case of error, if not set will default to the model's default. \"\"\"\n "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.format","title":"format: Optional[Literal['json']] instance-attribute ","text":"One of \"json\". "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.schema","title":"schema: Union[Type[BaseModel], Dict[str, Any]] instance-attribute ","text":"The schema to use for the structured output, a pydantic.BaseModel class. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.mode","title":"mode: Optional[str] instance-attribute ","text":"Generation mode. Take a look at instructor.Mode for more information, if not informed it will be determined automatically. "},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.max_retries","title":"max_retries: int instance-attribute ","text":"Number of times to reask the model in case of error, if not set will default to the model's default. "},{"location":"sections/community/","title":"Community","text":"We are an open-source community-driven project not only focused on building a great product but also on building a great community, where you can get support, share your experiences, and contribute to the project! We would love to hear from you and help you get started with distilabel. -
Discord In our Discord channels (#argilla-general and #argilla-help), you can get direct support from the community. Discord \u2197 -
Community Meetup We host bi-weekly community meetups where you can listen in or present your work. Community Meetup \u2197 -
Changelog The changelog is where you can find the latest updates and changes to the distilabel project. Changelog \u2197 -
Roadmap We love to discuss our plans with the community. Feel encouraged to participate in our roadmap discussions. Roadmap \u2197 "},{"location":"sections/community/#badges","title":"Badges","text":"If you build something cool with distilabel consider adding one of these badges to your dataset or model card. [<img src=\"https://raw.githubusercontent.com/argilla-io/distilabel/main/docs/assets/distilabel-badge-light.png\" alt=\"Built with Distilabel\" width=\"200\" height=\"32\"/>](https://github.com/argilla-io/distilabel)\n [<img src=\"https://raw.githubusercontent.com/argilla-io/distilabel/main/docs/assets/distilabel-badge-dark.png\" alt=\"Built with Distilabel\" width=\"200\" height=\"32\"/>](https://github.com/argilla-io/distilabel)\n "},{"location":"sections/community/#contribute","title":"Contribute","text":"To directly contribute with distilabel , check our good first issues or open a new one. "},{"location":"sections/community/contributor/","title":"How to contribute?","text":"Thank you for investing your time in contributing to the project! Any contribution you make will be reflected in the most recent version of distilabel \ud83e\udd29. New to contributing in general? If you're a new contributor, read the README to get an overview of the project. In addition, here are some resources to help you get started with open-source contributions: - Discord: You are welcome to join the distilabel Discord community, where you can keep in touch with other users, contributors and the distilabel team. In the following section, you can find more information on how to get started in Discord.
- Git: This is a very useful tool to keep track of the changes in your files. Using the command-line interface (CLI), you can make your contributions easily. For that, you need to have it installed and updated on your computer.
- GitHub: It is a platform and cloud-based service that uses Git and allows developers to collaborate on projects. To contribute to distilabel, you'll need to create an account. Check the Contributor Workflow with Git and GitHub for more info.
- Developer Documentation: To collaborate, you'll need to set up an efficient environment. Check the Installation guide to learn how to do it.
"},{"location":"sections/community/contributor/#first-contact-in-discord","title":"First Contact in Discord","text":"Discord is a handy tool for more casual conversations and to answer day-to-day questions. As part of Hugging Face, we have set up some distilabel channels on the server. Click here to join the Hugging Face Discord community effortlessly. When part of the Hugging Face Discord, you can select \"Channels & roles\" and select \"Argilla\" along with any of the other groups that are interesting to you. \"Argilla\" will cover anything about argilla and distilabel. You can join the following channels: - #argilla-distilabel-announcements: \ud83d\udce3 Stay up-to-date.
- #argilla-distilabel-general: \ud83d\udcac For general discussions.
- #argilla-distilabel-help: \ud83d\ude4b\u200d\u2640\ufe0f Need assistance? We're always here to help. Select the appropriate label (argilla or distilabel) for your issue and post it.
So now there is only one thing left to do: introduce yourself and talk to the community. You'll always be welcome! \ud83e\udd17\ud83d\udc4b "},{"location":"sections/community/contributor/#contributor-workflow-with-git-and-github","title":"Contributor Workflow with Git and GitHub","text":"If you're working with distilabel and suddenly a new idea comes to your mind or you find an issue that can be improved, it's time to actively participate and contribute to the project! "},{"location":"sections/community/contributor/#report-an-issue","title":"Report an issue","text":"If you spot a problem, search if an issue already exists, you can use the Label filter. If that is the case, participate in the conversation. If it does not exist, create an issue by clicking on New Issue . This will show various templates; choose the one that best suits your issue. Once you choose one, you will need to fill it in following the guidelines. Try to be as clear as possible. In addition, you can assign yourself to the issue and add or choose the right labels. Finally, click on Submit new issue . "},{"location":"sections/community/contributor/#work-with-a-fork","title":"Work with a fork","text":""},{"location":"sections/community/contributor/#fork-the-distilabel-repository","title":"Fork the distilabel repository","text":"After having reported the issue, you can start working on it. For that, you will need to create a fork of the project. To do that, click on the Fork button. Now, fill in the information. Remember to uncheck the Copy develop branch only if you are going to work in or from another branch (for instance, to fix documentation, the main branch is used). Then, click on Create fork . You will be redirected to your fork. You can see that you are in your fork because the name of the repository will be your username/distilabel , and it will indicate forked from argilla-io/distilabel . "},{"location":"sections/community/contributor/#clone-your-forked-repository","title":"Clone your forked repository","text":"In order to make the required adjustments, clone the forked repository to your local machine. Choose the destination folder and run the following command: git clone https://github.com/[your-github-username]/distilabel.git\ncd distilabel\n To keep your fork\u2019s main/develop branch up to date with our repo, add it as an upstream remote branch. git remote add upstream https://github.com/argilla-io/distilabel.git\n "},{"location":"sections/community/contributor/#create-a-new-branch","title":"Create a new branch","text":"For each issue you're addressing, it's advisable to create a new branch. GitHub offers a straightforward method to streamline this process. \u26a0\ufe0f Never work directly on the main or develop branch. Always create a new branch for your changes. Navigate to your issue, and on the right column, select Create a branch . After the new window pops up, the branch will be named after the issue and include a prefix such as feature/, bug/, or docs/ to facilitate quick recognition of the issue type. In the Repository destination , pick your fork ( [your-github-username]/distilabel), and then select Change branch source to specify the source branch for creating the new one. Complete the process by clicking Create branch . \ud83e\udd14 Remember that the main branch is only used to work with the documentation. For any other changes, use the develop branch. Now, locally, change to the new branch you just created. 
git fetch origin\ngit checkout [branch-name]\n "},{"location":"sections/community/contributor/#make-changes-and-push-them","title":"Make changes and push them","text":"Make the changes you want in your local repository, and test that everything works and you are following the guidelines. Once you have finished, you can check the status of your repository and synchronize with the upstreaming repo with the following command: # Check the status of your repository\ngit status\n\n# Synchronize with the upstreaming repo\ngit checkout [branch-name]\ngit rebase [default-branch]\n If everything is right, we need to commit and push the changes to your fork. For that, run the following commands: # Add the changes to the staging area\ngit add filename\n\n# Commit the changes by writing a proper message\ngit commit -m \"commit-message\"\n\n# Push the changes to your fork\ngit push origin [branch-name]\n When pushing, you will be asked to enter your GitHub login credentials. Once the push is complete, all local commits will be on your GitHub repository. "},{"location":"sections/community/contributor/#create-a-pull-request","title":"Create a pull request","text":"Come back to GitHub, navigate to the original repository where you created your fork, and click on Compare & pull request . First, click on compare across forks and select the right repositories and branches. In the base repository, keep in mind that you should select either main or develop based on the modifications made. In the head repository, indicate your forked repository and the branch corresponding to the issue. Then, fill in the pull request template. You should add a prefix to the PR name, as we did with the branch above. If you are working on a new feature, you can name your PR as feat: TITLE . If your PR consists of a solution for a bug, you can name your PR as bug: TITLE . And, if your work is for improving the documentation, you can name your PR as docs: TITLE . In addition, on the right side, you can select a reviewer (for instance, if you discussed the issue with a member of the team) and assign the pull request to yourself. It is highly advisable to add labels to PR as well. You can do this again by the labels section right on the screen. For instance, if you are addressing a bug, add the bug label, or if the PR is related to the documentation, add the documentation label. This way, PRs can be easily filtered. Finally, fill in the template carefully and follow the guidelines. Remember to link the original issue and enable the checkbox to allow maintainer edits so the branch can be updated for a merge. Then, click on Create pull request . For the PR body, ensure you give a description of what the PR contains, and add examples if possible (and if they apply to the contribution) to help with the review process. You can take a look at #PR 974 or #PR 983 for examples of typical PRs. "},{"location":"sections/community/contributor/#review-your-pull-request","title":"Review your pull request","text":"Once you submit your PR, a team member will review your proposal. We may ask questions, request additional information, or ask for changes to be made before a PR can be merged, either using suggested changes or pull request comments. You can apply the changes directly through the UI (check the files changed and click on the right-corner three dots; see image below) or from your fork, and then commit them to your branch. The PR will be updated automatically, and the suggestions will appear as outdated . 
If you run into any merge issues, check out this git tutorial to help you resolve merge conflicts and other issues. "},{"location":"sections/community/contributor/#your-pr-is-merged","title":"Your PR is merged!","text":"Congratulations \ud83c\udf89\ud83c\udf8a We thank you \ud83e\udd29 Once your PR is merged, your contributions will be publicly visible on the distilabel GitHub. Additionally, we will include your changes in the next release based on our development branch. "},{"location":"sections/community/contributor/#additional-resources","title":"Additional resources","text":"Here are some helpful resources for your reference. - Configuring Discord, a guide to learning how to get started with Discord.
- Pro Git, a book to learn Git.
- Git in VSCode, a guide to learning how to easily use Git in VSCode.
- GitHub Skills, an interactive course for learning GitHub.
"},{"location":"sections/community/developer_documentation/","title":"Developer Documentation","text":"Thank you for investing your time in contributing to the project! If you don't have the repository locally, and need any help, go to the contributor guide and read the contributor workflow with Git and GitHub first. "},{"location":"sections/community/developer_documentation/#set-up-the-python-environment","title":"Set up the Python environment","text":"To work on the distilabel , you must install the package on your system. Tip This guide will use uv , but pip and venv can be used as well, this guide can work quite similar with both options. From the root of the cloned Distilabel repository, you should move to the distilabel folder in your terminal. cd distilabel\n "},{"location":"sections/community/developer_documentation/#create-a-virtual-environment","title":"Create a virtual environment","text":"The first step will be creating a virtual environment to keep our dependencies isolated. Here we are choosing python 3.11 (uv venv documentation), and then activate it: uv venv .venv --python 3.11\nsource .venv/bin/activate\n "},{"location":"sections/community/developer_documentation/#install-the-project","title":"Install the project","text":"Installing from local (we are using uv pip ): uv pip install -e .\n We have extra dependencies with their name, depending on the part you are working on, you may want to install some dependency (take a look at pyproject.toml in the repo to see all the extra dependencies): uv pip install -e \".[vllm,outlines]\"\n "},{"location":"sections/community/developer_documentation/#linting-and-formatting","title":"Linting and formatting","text":"To maintain a consistent code format, install the pre-commit hooks to run before each commit automatically (we rely heavily on ruff ): uv pip install -e \".[dev]\"\npre-commit install\n "},{"location":"sections/community/developer_documentation/#running-tests","title":"Running tests","text":"All the changes you add to the codebase should come with tests, either unit or integration tests, depending on the type of change, which are placed under tests/unit and tests/integration respectively. Start by installing the tests dependencies: uv pip install \".[tests]\"\n Running the whole tests suite may take some time, and you will need all the dependencies installed, so just run your tests, and the whole tests suite will be run for you in the CI: # Run specific tests\npytest tests/unit/steps/generators/test_data.py\n "},{"location":"sections/community/developer_documentation/#set-up-the-documentation","title":"Set up the documentation","text":"To contribute to the documentation and generate it locally, ensure you have installed the development dependencies: uv pip install -e \".[docs]\"\n And run the following command to create the development server with mkdocs : mkdocs serve\n "},{"location":"sections/community/developer_documentation/#documentation-guidelines","title":"Documentation guidelines","text":"As mentioned, we use mkdocs to build the documentation. You can write the documentation in markdown format, and it will automatically be converted to HTML. In addition, you can include elements such as tables, tabs, images, and others, as shown in this guide. We recommend following these guidelines: -
Use clear and concise language: Ensure the documentation is easy to understand for all users by using straightforward language and including meaningful examples. Images are not easy to maintain, so use them only when necessary and place them in the appropriate folder within the docs/assets/images directory. -
Verify code snippets: Double-check that all code snippets are correct and runnable. -
Review spelling and grammar: Check the spelling and grammar of the documentation. -
Update the table of contents: If you add a new page, include it in the relevant index.md or the mkdocs.yml file. "},{"location":"sections/community/developer_documentation/#components-gallery","title":"Components gallery","text":"The components gallery section of the documentation is automatically generated thanks to a custom plugin, it will be run when mkdocs serve is called. This guide to the steps helps us visualize each step, as well as examples of use. Note Changes done to the docstrings of Steps/Tasks/LLMs won't appear in the components gallery automatically, you will have to stop the mkdocs server and run it again to see the changes, everything else is reloaded automatically. "},{"location":"sections/community/popular_issues/","title":"Issue dashboard","text":"Most engaging open issuesLatest issues open by the communityPlanned issues for upcoming releases Rank Issue Reactions Comments 1 1041 - [FEATURE] Add Offline batch generation for open models with EXXA API \ud83d\udc4d 2 \ud83d\udcac 1 2 995 - [FEATURE] mlx-lm integration \ud83d\udc4d 2 \ud83d\udcac 1 3 737 - [FEATURE] Allow FormatTextGenerationSFT to include tools/function calls in the formatted messages. \ud83d\udc4d 2 \ud83d\udcac 0 4 1001 - [FEATURE] sglang integration \ud83d\udc4d 1 \ud83d\udcac 1 5 797 - [FEATURE] synthetic data generation for predictive NLP tasks \ud83d\udc4d 1 \ud83d\udcac 1 6 914 - [FEATURE] Use Step.resources to set tensor_parallel_size and pipeline_parallel_size in vLLM \ud83d\udc4d 1 \ud83d\udcac 0 7 588 - [FEATURE] Single request caching \ud83d\udc4d 1 \ud83d\udcac 0 8 953 - [EXAMPLE] Add CRAFT Your Dataset: Task-Specific Synthetic Dataset Generation Through Corpus Retrieval and Augmentation example \ud83d\udc4d 0 \ud83d\udcac 6 9 972 - [BUG] Input data size != output data size when task batch size < batch size of predecessor \ud83d\udc4d 0 \ud83d\udcac 4 10 859 - [FEATURE] Update PushToHub to stream data to the Hub \ud83d\udc4d 0 \ud83d\udcac 4 Rank Issue Author 1 \ud83d\udfe2 1070 - [BUG] Pipeline serialization/caching issue when including RoutingBatchFunction by liamcripwell 2 \ud83d\udfe2 1068 - [BUG] GenerateSentencePair(...) always returns None positive and negative pairs by caesar-one 3 \ud83d\udfe2 1064 - [DOCS] Update basic guides of steps and tasks by plaguss 4 \ud83d\udfe2 1058 - [FEATURE] Implement a rate limiter for API calls by plaguss 5 \ud83d\udfe3 1056 - [DOCS] The example on how to use a Step no longer works by wwymak 6 \ud83d\udfe3 1049 - [BUG] vLLM Task not utilizing multiple GPUs in parallel when replicas > 1 by adamlin120 7 \ud83d\udfe3 1048 - [BUG] OepnAI JSON format by tinyrolls 8 \ud83d\udfe2 1047 - Failed to load all the steps. Could not run pipeline. 
by yuqie 9 \ud83d\udfe2 1046 - [FEATURE] Compute the input/output tokens of a dataset by plaguss 10 \ud83d\udfe3 1044 - Receiving error: The number of required GPUs exceeds the total number of available GPUs in the placement group by saurabhbbjain Rank Issue Milestone 1 \ud83d\udfe2 579 - [FEATURE] Sequential execution for local pipeline 1.4.0 2 \ud83d\udfe2 771 - [FEATURE] Allow passing path to YAML file containing pipeline runtime parameters in distilabel run 1.4.0 3 \ud83d\udfe2 773 - [DOCS] Include section/guide describing pipeline patterns 1.4.0 4 \ud83d\udfe2 802 - [FEATURE] Add defaults to Steps and Tasks so they can be more easily connected 1.4.0 5 \ud83d\udfe2 880 - [FEATURE] Add exclude_from_signature attribute 1.4.0 6 \ud83d\udfe2 942 - [BUG] make_generator_step can fail when setting the _dataset_info internally 1.4.0 7 \ud83d\udfe2 662 - [FEATURE] Allow passing self to steps created with step decorator 1.4.0 8 \ud83d\udfe2 889 - [FEATURE] Replace extra_sampling_params for normal arguments in vLLM 1.4.0 9 \ud83d\udfe2 738 - [FEATURE] Update LLM.generate interface to allow returning arbitrary/extra stuff related to the generation 1.5.0 10 \ud83d\udfe2 749 - [IMPLEMENTATION] Self-play with Execution Feedback: Improving Instruction-following Capabilities of Large Language Models 1.5.0 Last update: 2024-12-18 "},{"location":"sections/getting_started/faq/","title":"Frequent Asked Questions (FAQ)","text":"How can I rename the columns in a batch? Every Step has both input_mappings and output_mappings attributes that can be used to rename the columns in each batch. But input_mappings will only map, meaning that if you have a batch with the column A and you want to rename it to B , you should use input_mappings={\"A\": \"B\"} , but that will only be applied to that specific Step meaning that the next step in the pipeline will still have the column A instead of B . While output_mappings will indeed apply the rename, meaning that if the Step produces the column A and you want to rename to B , you should use output_mappings={\"A\": \"B\"} , and that will be applied to the next Step in the pipeline. Will the API Keys be exposed when sharing the pipeline? No, those will be masked out using pydantic.SecretStr , meaning that those won't be exposed when sharing the pipeline. This also means that if you want to re-run your own pipeline and the API keys have not been provided via environment variable but either via an attribute or runtime parameter, you will need to provide them again. Does it work for Windows? Yes, but you may need to set the multiprocessing context in advance to ensure that the spawn method is used since the default method fork is not available on Windows. import multiprocessing as mp\n\nmp.set_start_method(\"spawn\")\n Will the custom Steps / Tasks / LLMs be serialized too? No, at the moment, only the references to the classes within the distilabel library will be serialized, meaning that if you define a custom class used within the pipeline, the serialization won't break, but the deserialize will fail since the class won't be available unless used from the same file. What happens if Pipeline.run fails? Do I lose all the data? No, indeed, we're using a cache mechanism to store all the intermediate results in the disk so, if a Step fails; the pipeline can be re-run from that point without losing the data, only if nothing is changed in the Pipeline . 
All the data will be stored in .cache/distilabel , but the only data that will persist at the end of the Pipeline.run execution is the one from the leaf step/s, so bear that in mind. For more information on the caching mechanism in distilabel , you can check the Learn - Advanced - Caching section. Also, note that when running a Step or a Task standalone, the cache mechanism won't be used, so if you want to use that, you should use the Pipeline context manager. How can I use the same LLM across several tasks without having to load it several times? You can serve the LLM using a solution like TGI or vLLM, and then connect to it using an AsyncLLM client like InferenceEndpointsLLM or OpenAILLM . Please refer to Serving LLMs guide for more information. Can distilabel be used with OpenAI Batch API? Yes, distilabel is integrated with OpenAI Batch API via OpenAILLM. Check LLMs - Offline Batch Generation for a small example on how to use it and Advanced - Offline Batch Generation for a more detailed guide. Prevent overloads on Free Serverless Endpoints When running a task using the InferenceEndpointsLLM with Free Serverless Endpoints, you may be facing some errors such as Model is overloaded if you let the batch size to the default (set at 50). To fix the issue, lower the value or even better set input_batch_size=1 in your task. It may take a longer time to finish, but please remember this is a free service. from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import TextGeneration\n\nTextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=1\n)\n "},{"location":"sections/getting_started/installation/","title":"Installation","text":"You will need to have at least Python 3.9 or higher, up to Python 3.12, since support for the latter is still a work in progress. To install the latest release of the package from PyPI you can use the following command: pip install distilabel --upgrade\n Alternatively, you may also want to install it from source i.e. the latest unreleased version, you can use the following command: pip install \"distilabel @ git+https://github.com/argilla-io/distilabel.git@develop\" --upgrade\n Note We are installing from develop since that's the branch we use to collect all the features, bug fixes, and improvements that will be part of the next release. If you want to install from a specific branch, you can replace develop with the branch name. "},{"location":"sections/getting_started/installation/#extras","title":"Extras","text":"Additionally, as part of distilabel some extra dependencies are available, mainly to add support for some of the LLM integrations we support. Here's a list of the available extras: "},{"location":"sections/getting_started/installation/#llms","title":"LLMs","text":" -
anthropic : for using models available in Anthropic API via the AnthropicLLM integration. -
argilla : for exporting the generated datasets to Argilla. -
cohere : for using models available in Cohere via the CohereLLM integration. -
groq : for using models available in Groq using groq Python client via the GroqLLM integration. -
hf-inference-endpoints : for using the Hugging Face Inference Endpoints via the InferenceEndpointsLLM integration. -
hf-transformers : for using models available in transformers package via the TransformersLLM integration. -
litellm : for using LiteLLM to call any LLM using OpenAI format via the LiteLLM integration. -
llama-cpp : for using llama-cpp-python Python bindings for llama.cpp via the LlamaCppLLM integration. -
mistralai : for using models available in Mistral AI API via the MistralAILLM integration. -
ollama : for using Ollama and their available models via OllamaLLM integration. -
openai : for using OpenAI API models via the OpenAILLM integration, or the rest of the integrations based on OpenAI and relying on its client as AnyscaleLLM , AzureOpenAILLM , and TogetherLLM . -
vertexai : for using Google Vertex AI proprietary models via the VertexAILLM integration. -
vllm : for using vllm serving engine via the vLLM integration. -
sentence-transformers : for generating sentence embeddings using sentence-transformers. "},{"location":"sections/getting_started/installation/#data-processing","title":"Data processing","text":" -
ray : for scaling and distributing a pipeline with Ray. -
faiss-cpu and faiss-gpu : for generating sentence embeddings using faiss. -
minhash : for using minhash for duplicate detection with datasketch and nltk. -
text-clustering : for using text clustering with UMAP and Scikit-learn. "},{"location":"sections/getting_started/installation/#structured-generation","title":"Structured generation","text":""},{"location":"sections/getting_started/installation/#recommendations-notes","title":"Recommendations / Notes","text":"The mistralai dependency requires Python 3.9 or higher, so if you're willing to use the distilabel.models.llms.MistralLLM implementation, you will need to have Python 3.9 or higher. In some cases like transformers and vllm , the installation of flash-attn is recommended if you are using a GPU accelerator since it will speed up the inference process, but the installation needs to be done separately, as it's not included in the distilabel dependencies. pip install flash-attn --no-build-isolation\n Also, if you are willing to use the llama-cpp-python integration for running local LLMs, note that the installation process may get a bit trickier depending on which OS you are using, so we recommend you read through their Installation section in their docs. "},{"location":"sections/getting_started/quickstart/","title":"Quickstart","text":""},{"location":"sections/getting_started/quickstart/#quickstart","title":"Quickstart","text":"Distilabel provides all the tools you need to build your scalable and reliable pipelines for synthetic data generation and AI feedback. Pipelines are used to generate data, evaluate models, manipulate data, or any other general task. They are made up of different components: Steps, Tasks and LLMs, which are chained together in a directed acyclic graph (DAG). - Steps: These are the building blocks of your pipeline. Normal steps are used for basic executions like loading data, applying some transformations, or any other general task.
- Tasks: These are steps that rely on LLMs and prompts to perform generative tasks. For example, they can be used to generate data, evaluate models or manipulate data.
- LLMs: These are the models that will perform the task. They can be local or remote models, and open-source or commercial models.
Pipelines are designed to be scalable and reliable. They can be executed in a distributed manner, and they can be cached and recovered. This is useful when dealing with large datasets or when you want to ensure that your pipeline is reproducible. Besides that, pipelines are designed to be modular and flexible. You can easily add new steps, tasks, or LLMs to your pipeline, and you can also easily modify or remove them. An example architecture of a pipeline to generate a dataset of preferences is the following: "},{"location":"sections/getting_started/quickstart/#installation","title":"Installation","text":"To install the latest release with hf-inference-endpoints extra of the package from PyPI you can use the following command: pip install distilabel[hf-inference-endpoints] --upgrade\n "},{"location":"sections/getting_started/quickstart/#use-a-generic-pipeline","title":"Use a generic pipeline","text":"To use a generic pipeline for an ML task, you can use the InstructionResponsePipeline class. This class is a generic pipeline that can be used to generate data for supervised fine-tuning tasks. It uses the InferenceEndpointsLLM class to generate data based on the input data and the model. from distilabel.pipeline import InstructionResponsePipeline\n\npipeline = InstructionResponsePipeline()\ndataset = pipeline.run()\n The InstructionResponsePipeline class will use the InferenceEndpointsLLM class with the model meta-llama/Meta-Llama-3.1-8B-Instruct to generate data based on the system prompt. The output data will be a dataset with the columns instruction and response . The class uses a generic system prompt, but you can customize it by passing the system_prompt parameter to the class. Note We're actively working on building more pipelines for different tasks. If you have any suggestions or requests, please let us know! We're currently working on pipelines for classification, Direct Preference Optimization, and Information Retrieval tasks. "},{"location":"sections/getting_started/quickstart/#define-a-custom-pipeline","title":"Define a Custom pipeline","text":"In this guide we will walk you through the process of creating a simple pipeline that uses the InferenceEndpointsLLM class to generate text. The Pipeline will load a dataset that contains a column named prompt from the Hugging Face Hub via the step LoadDataFromHub and then use the InferenceEndpointsLLM class to generate text based on the dataset using the TextGeneration task. You can check the available models in the Hugging Face Model Hub and filter by Inference status . 
from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline( # (1)\n name=\"simple-text-generation-pipeline\",\n description=\"A simple text generation pipeline\",\n) as pipeline: # (2)\n load_dataset = LoadDataFromHub( # (3)\n output_mappings={\"prompt\": \"instruction\"},\n )\n\n text_generation = TextGeneration( # (4)\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n ), # (5)\n system_prompt=\"You are a creative AI Assistant writer.\",\n template=\"Follow the following instruction: {{ instruction }}\" # (6)\n )\n\n load_dataset >> text_generation # (7)\n\nif __name__ == \"__main__\":\n distiset = pipeline.run( # (8)\n parameters={\n load_dataset.name: {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n text_generation.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n },\n )\n distiset.push_to_hub(repo_id=\"distilabel-example\") # (9)\n -
We define a Pipeline with the name simple-text-generation-pipeline and a description A simple text generation pipeline . Note that the name is mandatory and will be used to calculate the cache signature path, so changing the name will change the cache path and the pipeline will be identified as a different one. -
We are using the Pipeline context manager, meaning that every Step subclass that is defined within the context manager will be added to the pipeline automatically. -
We define a LoadDataFromHub step named load_dataset that will load a dataset from the Hugging Face Hub, as provided via runtime parameters in the pipeline.run method below, but it can also be defined within the class instance via the arg repo_id=... . This step will produce output batches with the rows from the dataset, and the column prompt will be mapped to the instruction field. -
We define a TextGeneration task named text_generation that will generate text based on the instruction field from the dataset. This task will use the InferenceEndpointsLLM class with the model Meta-Llama-3.1-8B-Instruct . -
We define the InferenceEndpointsLLM class with the model Meta-Llama-3.1-8B-Instruct that will be used by the TextGeneration task. In this case, since the InferenceEndpointsLLM is used, we assume that the HF_TOKEN environment variable is set. -
Both system_prompt and template are optional fields. The template must be provided as a string following the Jinja2 template format, and the fields that appear there (\"instruction\" in this case, which corresponds to the default) must be provided in the columns attribute. The component gallery for TextGeneration has examples to get you started. -
We connect the load_dataset step to the text_generation task using the rshift operator, meaning that the output from the load_dataset step will be used as input for the text_generation task. -
We run the pipeline with the parameters for the load_dataset and text_generation steps. The load_dataset step will use the repository distilabel-internal-testing/instruction-dataset-mini and the test split, and the text_generation task will use the generation_kwargs with the temperature set to 0.7 and the max_new_tokens set to 512 . -
Optionally, we can push the generated Distiset to the Hugging Face Hub repository distilabel-example . This will allow you to share the generated dataset with others and use it in other pipelines. "},{"location":"sections/how_to_guides/","title":"How-to guides","text":"Welcome to the how-to guides section! Here you will find a collection of guides that will help you get started with Distilabel. We have divided the guides into two categories: basic and advanced. The basic guides will help you get started with the core concepts of Distilabel, while the advanced guides will help you explore more advanced features. "},{"location":"sections/how_to_guides/#basic","title":"Basic","text":" -
Define Steps for your Pipeline Steps are the building blocks of your pipeline. They can be used to generate data, evaluate models, manipulate data, or any other general task. Define Steps -
Define Tasks that rely on LLMs Tasks are a specific type of step that rely on Language Models (LLMs) to generate data. Define Tasks -
Define LLMs as local or remote models LLMs are the core of your tasks. They are used to integrate with local models or remote APIs. Define LLMs -
Execute Steps and Tasks in a Pipeline Pipeline is where you put all your steps and tasks together to create a workflow. Execute Pipeline "},{"location":"sections/how_to_guides/#advanced","title":"Advanced","text":" -
Using the Distiset dataset object Distiset is a dataset object based on the datasets library that can be used to store and manipulate data. Distiset -
Export data to Argilla Argilla is a platform that can be used to store, search, and apply feedback to datasets. Argilla -
Using a file system to pass data of batches between steps File system can be used to pass data between steps in a pipeline. File System -
Using CLI to explore and re-run existing Pipelines CLI can be used to explore and re-run existing pipelines through the command line. CLI -
Cache and recover pipeline executions Caching can be used to recover pipeline executions to avoid losing data and precious LLM calls. Caching -
Structured data generation Structured data generation can be used to generate data with a specific structure like JSON, function calls, etc. Structured Generation -
Serving an LLM for sharing it between several tasks Serve an LLM via TGI or vLLM to make requests and connect using a client like InferenceEndpointsLLM or OpenAILLM to avoid wasting resources. Sharing an LLM across tasks -
Impose requirements to your pipelines and steps Add requirements to steps in a pipeline to ensure they are installed and avoid errors. Pipeline requirements "},{"location":"sections/how_to_guides/advanced/argilla/","title":"Export data to Argilla","text":"Being able to export the generated synthetic datasets to Argilla, is a core feature within distilabel . We believe in the potential of synthetic data, but without removing the impact a human annotator or group of annotators can bring. So on, the Argilla integration makes it straightforward to push a dataset to Argilla while the Pipeline is running, to be able to follow along the generation process in Argilla's UI, as well as annotating the records on the fly. One can include a Step within the Pipeline to easily export the datasets to Argilla with a pre-defined configuration, suiting the annotation purposes. Before using any of the steps about to be described below, you should first have an Argilla instance up and running, so that you can successfully upload the data to Argilla. In order to deploy Argilla, the easiest and most straightforward way is to deploy it via the Argilla Template in Hugging Face Spaces as simply as following the steps there, or just via the following button: "},{"location":"sections/how_to_guides/advanced/argilla/#text-generation","title":"Text Generation","text":"For text generation scenarios, i.e. when the Pipeline contains a single TextGeneration step, we have designed the task TextGenerationToArgilla , which will seamlessly push the generated data to Argilla, and allow the annotator to review the records. The dataset will be pushed with the following configuration: -
Fields: instruction and generation , both being fields of type argilla.TextField , plus the automatically generated id for the given instruction to be able to search for other records with the same instruction in the dataset. The field instruction must always be a string, while the field generation can either be a single string or a list of strings (useful when there are multiple parent nodes of type TextGeneration ); even though each record will always contain at most one instruction -generation pair. -
Questions: quality will be the only question for the annotators to answer, i.e., to annotate, and it will be an argilla.LabelQuestion referring to the quality of the provided generation for the given instruction. It can be annotated as either \ud83d\udc4e (bad) or \ud83d\udc4d (good). Note The TextGenerationToArgilla step will only work as is if the Pipeline contains one or multiple TextGeneration steps, or if the columns instruction and generation are available within the batch data. Otherwise, the variable input_mappings will need to be set so that either both or one of instruction and generation are mapped to one of the existing columns in the batch data. from distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts, TextGenerationToArgilla\nfrom distilabel.steps.tasks import TextGeneration\n\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n load_dataset = LoadDataFromDicts(\n name=\"load_dataset\",\n data=[\n {\n \"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\",\n },\n ],\n )\n\n text_generation = TextGeneration(\n name=\"text_generation\",\n llm=OpenAILLM(model=\"gpt-4\"),\n )\n\n to_argilla = TextGenerationToArgilla(\n dataset_name=\"my-dataset\",\n dataset_workspace=\"admin\",\n api_url=\"<ARGILLA_API_URL>\",\n api_key=\"<ARGILLA_API_KEY>\",\n )\n\n load_dataset >> text_generation >> to_argilla\n\npipeline.run()\n "},{"location":"sections/how_to_guides/advanced/argilla/#preference","title":"Preference","text":"For preference scenarios, i.e. when the Pipeline contains multiple TextGeneration steps, we have designed the task PreferenceToArgilla , which will seamlessly push the generated data to Argilla, and allow the annotator to review the records. The dataset will be pushed with the following configuration: -
Fields: instruction and generations , both being fields of type argilla.TextField , plus the automatically generated id for the given instruction to be able to search for other records with the same instruction in the dataset. The field instruction must always be a string, while the field generations must be a list of strings, containing the generated texts for the given instruction so that at least there are two generations to compare. Other than that, the number of generation fields within each record in Argilla will be defined by the value of the variable num_generations to be provided in the PreferenceToArgilla step. -
Questions: rating and rationale will be the pairs of questions to be defined per each generation i.e. per each value within the range from 0 to num_generations , and those will be of types argilla.RatingQuestion and argilla.TextQuestion , respectively. Note that only the first pair of questions will be mandatory, since only one generation is ensured to be within the batch data. Additionally, note that the provided ratings will range from 1 to 5, and to mention that Argilla only supports values above 0. Note The PreferenceToArgilla step will only work if the Pipeline contains multiple TextGeneration steps, or if the columns instruction and generations are available within the batch data. Otherwise, the variable input_mappings will need to be set so that either both or one of instruction and generations are mapped to one of the existing columns in the batch data. Note Additionally, if the Pipeline contains an UltraFeedback step, the ratings and rationales will also be available and be automatically injected as suggestions to the existing dataset. from distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts, PreferenceToArgilla\nfrom distilabel.steps.tasks import TextGeneration\n\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n load_dataset = LoadDataFromDicts(\n name=\"load_dataset\",\n data=[\n {\n \"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\",\n },\n ],\n )\n\n text_generation = TextGeneration(\n name=\"text_generation\",\n llm=OpenAILLM(model=\"gpt-4\"),\n num_generations=4,\n group_generations=True,\n )\n\n to_argilla = PreferenceToArgilla(\n dataset_name=\"my-dataset\",\n dataset_workspace=\"admin\",\n api_url=\"<ARGILLA_API_URL>\",\n api_key=\"<ARGILLA_API_KEY>\",\n num_generations=4,\n )\n\n load_dataset >> text_generation >> to_argilla\n\nif __name__ == \"__main__\":\n pipeline.run()\n "},{"location":"sections/how_to_guides/advanced/assigning_resources_to_step/","title":"Assigning resources to a Step ","text":"When dealing with complex pipelines that get executed in a distributed environment with abundant resources (CPUs and GPUs), sometimes it's necessary to allocate these resources judiciously among the Step s. This is why distilabel allows to specify the number of replicas , cpus and gpus for each Step . Let's see that with an example: from distilabel.pipeline import Pipeline\nfrom distilabel.models import vLLM\nfrom distilabel.steps import StepResources\nfrom distilabel.steps.tasks import PrometheusEval\n\n\nwith Pipeline(name=\"resources\") as pipeline:\n ...\n\n prometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]['content'] }}\\\\n{{ messages[1]['content'] }}[/INST]\",\n ),\n resources=StepResources(replicas=2, cpus=1, gpus=1)\n mode=\"absolute\",\n rubric=\"factual-validity\",\n reference=False,\n num_generations=1,\n group_generations=False,\n )\n In the example above, we're creating a PrometheusEval task (remember that Task s are Step s) that will use vLLM to serve prometheus-eval/prometheus-7b-v2.0 model. This task is resource intensive as it requires an LLM, which in turn requires a GPU to run fast. With that in mind, we have specified the resources required for the task using the StepResources class, and we have defined that we need 1 GPU and 1 CPU per replica of the task. In addition, we have defined that we need 2 replicas i.e. we will run two instances of the task so the computation for the whole dataset runs faster. 
In addition, StepResources uses the RuntimeParametersMixin, so we can also specify the resources for each step when running the pipeline: ...\n\nif __name__ == \"__main__\":\n pipeline.run(\n parameters={\n prometheus.name: {\"resources\": {\"replicas\": 2, \"cpus\": 1, \"gpus\": 1}}\n }\n )\n And that's it! When running the pipeline, distilabel will create the tasks in nodes that have the specified resources available. "},{"location":"sections/how_to_guides/advanced/caching/","title":"Pipeline cache","text":"distilabel will automatically save all the intermediate outputs generated by each Step of a Pipeline , so these outputs can be reused to recover the state of a pipeline execution that was stopped before finishing or to not have to re-execute steps from a pipeline after adding a new downstream step. "},{"location":"sections/how_to_guides/advanced/caching/#how-to-enabledisable-the-cache","title":"How to enable/disable the cache","text":"The use of the cache can be toggled using the use_cache parameter of the Pipeline.run method. If True , then distilabel will reuse the outputs of previous executions for the new execution. If False , then distilabel will re-execute all the steps of the pipeline to generate new outputs for all the steps. with Pipeline(name=\"my-pipeline\") as pipeline:\n ...\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False) # (1)\n - Pipeline cache is disabled
In addition, the cache can be enabled/disabled at Step level using its use_cache attribute. If True , then the outputs of the step will be reused in the new pipeline execution. If False , then the step will be re-executed to generate new outputs. If the cache of one step is disabled and the outputs have to be regenerated, then the outputs of the steps that depend on this step will also be regenerated. with Pipeline(name=\"writting-assistant\") as pipeline:\n load_data = LoadDataFromDicts(\n data=[\n {\n \"instruction\": \"How much is 2+2?\"\n }\n ]\n )\n\n generation = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"Qwen/Qwen2.5-72B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.8,\n \"max_new_tokens\": 512,\n },\n ),\n use_cache=False # (1)\n )\n\n load_data >> generation\n\nif __name__ == \"__main__\":\n distiset = pipeline.run()\n - Step cache is disabled and every time the pipeline is executed, this step will be re-executed
"},{"location":"sections/how_to_guides/advanced/caching/#how-a-cache-hit-is-triggered","title":"How a cache hit is triggered","text":"distilabel groups information and data generated by a Pipeline using the name of the pipeline, so the first factor that triggers a cache hit is the name of the pipeline. The second factor, is the Pipeline.signature property. This property returns a hash that is generated using the names of the steps used in the pipeline and their connections. The third factor, is the Pipeline.aggregated_steps_signature property which is used to determine if the new pipeline execution is exactly the same as one of the previous i.e. the pipeline contains exactly the same steps, with exactly the same connections and the steps are using exactly the same parameters. If these three factors are met, then the cache hit is triggered and the pipeline won't get re-executed and instead the function create_distiset will be used to create the resulting Distiset using the outputs of the previous execution, as it can be seen in the following image: If the new pipeline execution have a different Pipeline.aggregated_steps_signature i.e. at least one step has changed its parameters, distilabel will reuse the outputs of the steps that have not changed and re-execute the steps that have changed, as it can be seen in the following image: The same pipeline from above gets executed a third time, but this time the last step text_generation_1 changed, so it's needed to re-execute it. The other steps, as they have not been changed, doesn't need to be re-executed and their outputs are reused. "},{"location":"sections/how_to_guides/advanced/distiset/","title":"Using the Distiset dataset object","text":"A Pipeline in distilabel returns a special type of Hugging Face datasets.DatasetDict which is called Distiset . The Distiset is a dictionary-like object that contains the different configurations generated by the Pipeline , where each configuration corresponds to each leaf step in the DAG built by the Pipeline . Each configuration corresponds to a different subset of the dataset. This is a concept taken from \ud83e\udd17 datasets that lets you upload different configurations of the same dataset within the same repository and can contain different columns i.e. different configurations, which can be seamlessly pushed to the Hugging Face Hub. Below you can find an example of how to create a Distiset object that resembles a datasets.DatasetDict : from datasets import Dataset\nfrom distilabel.distiset import Distiset\n\ndistiset = Distiset(\n {\n \"leaf_step_1\": Dataset.from_dict({\"instruction\": [1, 2, 3]}),\n \"leaf_step_2\": Dataset.from_dict(\n {\"instruction\": [1, 2, 3, 4], \"generation\": [5, 6, 7, 8]}\n ),\n }\n)\n Note If there's only one leaf node, i.e., only one step at the end of the Pipeline , then the configuration name won't be the name of the last step, but it will be set to \"default\" instead, as that's more aligned with standard datasets within the Hugging Face Hub. "},{"location":"sections/how_to_guides/advanced/distiset/#distiset-methods","title":"Distiset methods","text":"We can interact with the different pieces generated by the Pipeline and treat them as different configurations . The Distiset contains just two methods: "},{"location":"sections/how_to_guides/advanced/distiset/#traintest-split","title":"Train/Test split","text":"Create a train/test split partition of the dataset for the different configurations or subsets. 
>>> distiset.train_test_split(train_size=0.9)\nDistiset({\n leaf_step_1: DatasetDict({\n train: Dataset({\n features: ['instruction'],\n num_rows: 2\n })\n test: Dataset({\n features: ['instruction'],\n num_rows: 1\n })\n })\n leaf_step_2: DatasetDict({\n train: Dataset({\n features: ['instruction', 'generation'],\n num_rows: 3\n })\n test: Dataset({\n features: ['instruction', 'generation'],\n num_rows: 1\n })\n })\n})\n "},{"location":"sections/how_to_guides/advanced/distiset/#push-to-hugging-face-hub","title":"Push to Hugging Face Hub","text":"Push the Distiset to a Hugging Face repository, where each one of the subsets will correspond to a different configuration: distiset.push_to_hub(\n \"my-org/my-dataset\",\n commit_message=\"Initial commit\",\n private=False,\n token=os.getenv(\"HF_TOKEN\"),\n generate_card=True,\n include_script=False\n)\n New since version 1.3.0 Since version 1.3.0 you can automatically push the script that created your pipeline to the same repository. For example, assuming you have a file like the following: sample_pipe.pywith Pipeline() as pipe:\n ...\ndistiset = pipe.run()\ndistiset.push_to_hub(\n \"my-org/my-dataset,\n include_script=True\n)\n After running the command, you could visit the repository and the file sample_pipe.py will be stored to simplify sharing your pipeline with the community. "},{"location":"sections/how_to_guides/advanced/distiset/#custom-docstrings","title":"Custom Docstrings","text":"distilabel contains a custom plugin to automatically generates a gallery for the different components. The information is extracted by parsing the Step 's docstrings. You can take a look at the docstrings in the source code of the UltraFeedback, and take a look at the corresponding entry in the components gallery to see an example of how the docstrings are rendered. If you create your own components and want the Citations automatically rendered in the README card (in case you are sharing your final distiset in the Hugging Face Hub), you may want to add the citation section. This is an example for the MagpieGenerator Task: class MagpieGenerator(GeneratorTask, MagpieBase):\n r\"\"\"Generator task the generates instructions or conversations using Magpie.\n ...\n\n Citations:\n\n ```\n @misc{xu2024magpiealignmentdatasynthesis,\n title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n year={2024},\n eprint={2406.08464},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n url={https://arxiv.org/abs/2406.08464},\n }\n ```\n \"\"\"\n The Citations section can include any number of bibtex references. To define them, you can add as much elements as needed just like in the example: each citation will be a block of the form: ```@misc{...}``` . This information will be automatically used in the README of your Distiset if you decide to call distiset.push_to_hub . Alternatively, if the Citations is not found, but in the References there are found any urls pointing to https://arxiv.org/ , we will try to obtain the Bibtex equivalent automatically. This way, Hugging Face can automatically track the paper for you and it's easier to find other datasets citing the same paper, or directly visiting the paper page. 
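To complement the paragraph above, here is a minimal, hypothetical sketch of a component whose docstring provides only a References section with an arXiv URL instead of an explicit Citations block (the class name is made up, and the import path and docstring layout are assumptions that mirror the MagpieGenerator example above):

from typing import Any, Dict, List, Union

from distilabel.steps.tasks import Task  # assumed import path for the base Task class


class MyCustomTask(Task):
    r"""A hypothetical task, shown only to illustrate the docstring conventions.

    References:
        - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464)
    """

    @property
    def inputs(self) -> List[str]:
        return ["instruction"]

    def format_input(self, input: Dict[str, Any]) -> List[Dict[str, str]]:
        # Turn a row into the chat format expected by the LLM
        return [{"role": "user", "content": input["instruction"]}]

    @property
    def outputs(self) -> List[str]:
        return ["generation"]

    def format_output(
        self, output: Union[str, None], input: Dict[str, Any]
    ) -> Dict[str, Any]:
        # Map the raw LLM output back into a column
        return {"generation": output}

With a docstring like this and no Citations section, the arXiv URL under References is the one that would be used to try to resolve the BibTeX entry automatically when the dataset card is generated.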
"},{"location":"sections/how_to_guides/advanced/distiset/#save-and-load-from-disk","title":"Save and load from disk","text":"Take into account that these methods work as datasets.load_from_disk and datasets.Dataset.save_to_disk so the arguments are directly passed to those methods. This means you can also make use of storage_options argument to save your Distiset in your cloud provider, including the distilabel artifacts (pipeline.yaml , pipeline.log and the README.md with the dataset card). You can read more in datasets documentation here. Save to diskLoad from disk (local)Load from disk (cloud) Save the Distiset to disk, and optionally (will be done by default) saves the dataset card, the pipeline config file and logs: distiset.save_to_disk(\n \"my-dataset\",\n save_card=True,\n save_pipeline_config=True,\n save_pipeline_log=True\n)\n Load a Distiset that was saved using Distiset.save_to_disk just the same way: distiset = Distiset.load_from_disk(\"my-dataset\")\n Load a Distiset from a remote location, like S3, GCS. You can pass the storage_options argument to authenticate with the cloud provider: distiset = Distiset.load_from_disk(\n \"s3://path/to/my_dataset\", # gcs:// or any filesystem tolerated by fsspec\n storage_options={\n \"key\": os.environ[\"S3_ACCESS_KEY\"],\n \"secret\": os.environ[\"S3_SECRET_KEY\"],\n ...\n }\n)\n Take a look at the remaining arguments at Distiset.save_to_disk and Distiset.load_from_disk . "},{"location":"sections/how_to_guides/advanced/distiset/#dataset-card","title":"Dataset card","text":"Having this special type of dataset comes with an added advantage when calling Distiset.push_to_hub , which is the automatically generated dataset card in the Hugging Face Hub. Note that it is enabled by default, but can be disabled by setting generate_card=False : distiset.push_to_hub(\"my-org/my-dataset\", generate_card=True)\n We will have an automatic dataset card (an example can be seen here) with some handy information like reproducing the Pipeline with the CLI , or examples of the records from the different subsets. "},{"location":"sections/how_to_guides/advanced/distiset/#create_distiset-helper","title":"create_distiset helper","text":"Lastly, we presented in the caching section the create_distiset function, you can take a look at the section to see how to create a Distiset from the cache folder, using the helper function to automatically include all the relevant data. "},{"location":"sections/how_to_guides/advanced/fs_to_pass_data/","title":"Using a file system to pass data of batches between steps","text":"In some situations, it can happen that the batches contains so much data that is faster to write it to disk and read it back in the next step, instead of passing it using the queue. To solve this issue, distilabel uses fsspec to allow providing a file system configuration and whether if this file system should be used to pass data between steps in the run method of the distilabel pipelines: Warning In order to use a specific file system/cloud storage, you will need to install the specific package providing the fsspec implementation for that file system. 
For instance, to use Google Cloud Storage you will need to install gcsfs : pip install gcsfs\n Check the available implementations: fsspec - Other known implementations from distilabel.pipeline import Pipeline\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n ...\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n ..., \n storage_parameters={\"path\": \"gcs://my-bucket\"},\n use_fs_to_pass_data=True\n )\n The code above sets up a file system (in this case Google Cloud Storage) and sets the flag use_fs_to_pass_data to specify that the data of the batches should be passed to the steps using the file system. The storage_parameters argument is optional, and in the case it's not provided but use_fs_to_pass_data==True , distilabel will use the local file system. Note As GlobalStep s receive all the data from the previous steps in one single batch accumulating all the data, it's very likely that the data of the batch will be too big to be passed using the queue. In this case and even if use_fs_to_pass_data==False , distilabel will use the file system to pass the data to the GlobalStep . "},{"location":"sections/how_to_guides/advanced/load_groups_and_execution_stages/","title":"Load groups and execution stages","text":"By default, the distilabel architecture loads all steps of a pipeline at the same time, as they are all supposed to process batches of data in parallel. However, loading all steps at once can waste resources in two scenarios: when using GlobalStep s that must wait for upstream steps to complete before processing data, or when running on machines with limited resources that cannot execute all steps simultaneously. In these cases, steps need to be loaded and executed in distinct load stages. "},{"location":"sections/how_to_guides/advanced/load_groups_and_execution_stages/#load-stages","title":"Load stages","text":"A load stage represents a point in the pipeline execution where a group of steps are loaded at the same time to process batches in parallel. These stages are required because: - There are some kinds of steps, like the
GlobalStep s that need to receive all the data at once from their upstream steps, i.e. they need their upstream steps to have finished their execution. It would be wasteful to load a GlobalStep at the same time as other steps of the pipeline, as that would take resources (from the machine or cluster running the pipeline) that wouldn't be used until the upstream steps have finished. - When running on machines or clusters with limited resources, it may not be possible to load and execute all steps simultaneously, as they would need to access the same limited resources (memory, CPU, GPU, etc.).
Having that said, the first element that will create a load stage when executing a pipeline are the GlobalStep , as they mark and divide a pipeline in three stages: one stage with the upstream steps of the global step, one stage with the global step, and one final stage with the downstream steps of the global step. For example, the following pipeline will contain three stages: from typing import TYPE_CHECKING\n\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts, StepInput, step\n\nif TYPE_CHECKING:\n from distilabel.typing import StepOutput\n\n\n@step(inputs=[\"instruction\"], outputs=[\"instruction2\"])\ndef DummyStep(inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n input[\"instruction2\"] = \"miau\"\n yield inputs\n\n\n@step(inputs=[\"instruction\"], outputs=[\"instruction2\"], step_type=\"global\")\ndef GlobalDummyStep(inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n input[\"instruction2\"] = \"miau\"\n yield inputs\n\n\nwith Pipeline() as pipeline:\n generator = LoadDataFromDicts(data=[{\"instruction\": \"Hi\"}] * 50)\n dummy_step_0 = DummyStep()\n global_dummy_step = GlobalDummyStep()\n dummy_step_1 = DummyStep()\n\n generator >> dummy_step_0 >> global_dummy_step >> dummy_step_1\n\nif __name__ == \"__main__\":\n load_stages = pipeline.get_load_stages()\n\n for i, steps_stage in enumerate(load_stages[0]):\n print(f\"Stage {i}: {steps_stage}\")\n\n # Output:\n # Stage 0: ['load_data_from_dicts_0', 'dummy_step_0']\n # Stage 1: ['global_dummy_step_0']\n # Stage 2: ['dummy_step_1']\n As we can see, the GlobalStep divided the pipeline execution in three stages. "},{"location":"sections/how_to_guides/advanced/load_groups_and_execution_stages/#load-groups","title":"Load groups","text":"While GlobalStep s automatically divide pipeline execution into stages, we many need fine-grained control over how steps are loaded and executed within each stage. This is where load groups come in. Load groups allows to specify which steps of the pipeline have to be loaded together within a stage. This is particularly useful when running on resource-constrained environments where all the steps cannot be executed in parallel. Let's see how it works with an example: from datasets import load_dataset\n\nfrom distilabel.llms import vLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import StepResources\nfrom distilabel.steps.tasks import TextGeneration\n\ndataset = load_dataset(\n \"distilabel-internal-testing/instruction-dataset-mini\", split=\"test\"\n).rename_column(\"prompt\", \"instruction\")\n\nwith Pipeline() as pipeline:\n text_generation_0 = TextGeneration(\n llm=vLLM(\n model=\"HuggingFaceTB/SmolLM2-1.7B-Instruct\",\n extra_kwargs={\"max_model_len\": 1024},\n ),\n resources=StepResources(gpus=1),\n )\n\n text_generation_1 = TextGeneration(\n llm=vLLM(\n model=\"HuggingFaceTB/SmolLM2-1.7B-Instruct\",\n extra_kwargs={\"max_model_len\": 1024},\n ),\n resources=StepResources(gpus=1),\n )\n\nif __name__ == \"__main__\":\n load_stages = pipeline.get_load_stages(load_groups=[[text_generation_1.name]])\n\n for i, steps_stage in enumerate(load_stages[0]):\n print(f\"Stage {i}: {steps_stage}\")\n\n # Output:\n # Stage 0: ['text_generation_0']\n # Stage 1: ['text_generation_1']\n\n distiset = pipeline.run(dataset=dataset, load_groups=[[text_generation_0.name]])\n In this example, we're working with a machine that has a single GPU, but the pipeline includes two instances of TextGeneration tasks both using vLLM and requesting 1 GPU. 
We cannot execute both steps in parallel. To fix that, we use the load_groups argument of the run method to specify that the text_generation_0 step has to be executed in isolation, in its own stage. This way, we can run the pipeline on a single GPU machine by executing the steps in different stages (sequentially) instead of in parallel. Some key points about load groups: - Load groups are specified as a list of lists, where each inner list represents a group of steps that should be loaded together.
- Same as
GlobalStep s, the load groups create a new load stage, dividing the pipeline in 3 stages: one for the upstream steps, one for the steps in the load group, and one for the downstream steps. "},{"location":"sections/how_to_guides/advanced/load_groups_and_execution_stages/#load-groups-modes","title":"Load groups modes","text":"In addition, distilabel allows passing some modes to the load_groups argument that will handle the creation of the load groups: \"sequential_step_execution\" : when passed, it will create a load group for each step i.e. the execution of the steps of the pipeline will be sequential. "},{"location":"sections/how_to_guides/advanced/offline_batch_generation/","title":"Offline Batch Generation","text":"The offline batch generation is a feature that some LLM s implemented in distilabel offer, allowing you to send the inputs to an LLM-as-a-service platform and wait for the outputs in an asynchronous manner. LLM-as-a-service platforms offer this feature as it allows them to gather many inputs and create batches as big as the hardware allows, maximizing the hardware utilization and reducing the cost of the service. In exchange, the user has to wait a certain amount of time for the outputs to be ready, but the cost per token is usually much lower. distilabel pipelines are able to handle LLM s that offer this feature in the following way: - The first time the pipeline gets executed, the
LLM will send the inputs to the platform. The platform will return jobs ids that can be used later to check the status of the jobs and retrieve the results. The LLM will save these jobs ids in its jobs_ids attribute and raise a special exception DistilabelOfflineBatchGenerationNotFinishedException that will be handled by the Pipeline . The jobs ids will be saved in the pipeline cache, so they can be used in subsequent calls. - The second time and subsequent calls will recover the pipeline execution and the
LLM won't send the inputs again to the platform. This time, as it has the jobs_ids , it will check whether the jobs have finished, and if they have, it will retrieve the results and return the outputs. If they haven't finished, then it will raise DistilabelOfflineBatchGenerationNotFinishedException again. - In addition, LLMs with offline batch generation can be specified to do polling until the jobs have finished, blocking the pipeline until they are done. If for some reason the polling needs to be stopped, one can press Ctrl+C or Cmd+C depending on your OS (or send a
SIGINT to the main process) which will stop the polling and raise DistilabelOfflineBatchGenerationNotFinishedException that will be handled by the pipeline as described above. Warning In order to recover the pipeline execution and retrieve the results, the pipeline cache must be enabled. If the pipeline cache is disabled, then it will send the inputs again and create different jobs incurring in extra costs. "},{"location":"sections/how_to_guides/advanced/offline_batch_generation/#example-pipeline-using-openaillm-with-offline-batch-generation","title":"Example pipeline using OpenAILLM with offline batch generation","text":"from distilabel.models import OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline() as pipeline:\n load_data = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n text_generation = TextGeneration(\n llm=OpenAILLM(\n model=\"gpt-3.5-turbo\",\n use_offline_batch_generation=True, # (1)\n )\n )\n\n load_data >> text_generation\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n load_data.name: {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset\",\n \"split\": \"test\",\n \"batch_size\": 500,\n },\n }\n )\n - Indicate that the
OpenAILLM should use offline batch generation. "},{"location":"sections/how_to_guides/advanced/pipeline_requirements/","title":"Add requirements to run a Pipeline","text":"When sharing a Pipeline that contains custom Step s or Task s, you may want to add the specific requirements that are needed to run them. distilabel will take this list of requirements and warn the user if any are missing. Let's see how we can add additional requirements with an example. The first thing we're going to do is to add requirements for our CustomStep . To do so we use the requirements decorator to specify that the step has nltk>=3.8 as dependency (we can use version specifiers). In addition, we're going to specify at Pipeline level that we need distilabel>=1.3.0 to run it. from typing import List\n\nfrom distilabel.steps import Step\nfrom distilabel.steps.base import StepInput\nfrom distilabel.steps.typing import StepOutput\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.utils.requirements import requirements\nfrom distilabel.pipeline import Pipeline\n\n\n@requirements([\"nltk\"])\nclass CustomStep(Step):\n @property\n def inputs(self) -> List[str]:\n return [\"instruction\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"response\"]\n\n def process(self, inputs: StepInput) -> StepOutput: # type: ignore\n for input in inputs:\n input[\"response\"] = nltk.word_tokenize(input)\n yield inputs\n\n\nwith Pipeline(\n name=\"pipeline-with-requirements\", requirements=[\"distilabel>=1.3.0\"]\n) as pipeline:\n loader = LoadDataFromDicts(data=[{\"instruction\": \"sample sentence\"}])\n step1 = CustomStep()\n loader >> step1\n\nif __name__ == \"__main__\":\n pipeline.run()\n Once we call pipeline.run() , if any of the requirements informed at the Step or Pipeline level is missing, a ValueError will be raised telling us that we should install the list of dependencies: >>> pipeline.run()\n[06/27/24 11:07:33] ERROR ['distilabel.pipeline'] Please install the following requirements to run the pipeline: base.py:350\n distilabel>=1.3.0\n...\nValueError: Please install the following requirements to run the pipeline:\ndistilabel>=1.3.0\n "},{"location":"sections/how_to_guides/advanced/saving_step_generated_artifacts/","title":"Saving step generated artifacts","text":"Some Step s might need to produce an auxiliary artifact that is not a result of the computation, but is needed for the computation. For example, the FaissNearestNeighbour needs to create a Faiss index to compute the output of the step which are the top k nearest neighbours for each input. Generating the Faiss index takes time and it could potentially be reused outside of the distilabel pipeline, so it would be a shame not saving it. For this reason, Step s have a method called save_artifact that allows saving artifacts that will be included along the outputs of the pipeline in the generated Distiset . The generated artifacts will be uploaded and saved when using Distiset.push_to_hub or Distiset.save_to_disk respectively. Let's see how to use it with a simple example. 
from typing import List, TYPE_CHECKING\nfrom distilabel.steps import GlobalStep, StepInput, StepOutput\nimport matplotlib.pyplot as plt\n\nif TYPE_CHECKING:\n from distilabel.steps import StepOutput\n\n\nclass CountTextCharacters(GlobalStep):\n @property\n def inputs(self) -> List[str]:\n return [\"text\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"text_character_count\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n character_counts = []\n\n for input in inputs:\n text_character_count = len(input[\"text\"])\n input[\"text_character_count\"] = text_character_count\n character_counts.append(text_character_count)\n\n # Generate plot with the distribution of text character counts\n plt.figure(figsize=(10, 6))\n plt.hist(character_counts, bins=30, edgecolor=\"black\")\n plt.title(\"Distribution of Text Character Counts\")\n plt.xlabel(\"Character Count\")\n plt.ylabel(\"Frequency\")\n\n # Save the plot as an artifact of the step\n self.save_artifact(\n name=\"text_character_count_distribution\",\n write_function=lambda path: plt.savefig(path / \"figure.png\"),\n metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n )\n\n plt.close()\n\n yield inputs\n As it can be seen in the example above, we have created a simple step that counts the number of characters in each input text and generates a histogram with the distribution of the character counts. We save the histogram as an artifact of the step using the save_artifact method. The method takes three arguments: name : The name we want to give to the artifact. write_function : A function that writes the artifact to the desired path. The function will receive a path argument which is a pathlib.Path object pointing to the directory where the artifact should be saved. metadata : A dictionary with metadata about the artifact. This metadata will be saved along with the artifact. 
Let's execute the step with a simple pipeline and push the resulting Distiset to the Hugging Face Hub: Example full code from typing import TYPE_CHECKING, List\n\nimport matplotlib.pyplot as plt\nfrom datasets import load_dataset\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GlobalStep, StepInput, StepOutput\n\nif TYPE_CHECKING:\n from distilabel.steps import StepOutput\n\n\nclass CountTextCharacters(GlobalStep):\n @property\n def inputs(self) -> List[str]:\n return [\"text\"]\n\n @property\n def outputs(self) -> List[str]:\n return [\"text_character_count\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\": # type: ignore\n character_counts = []\n\n for input in inputs:\n text_character_count = len(input[\"text\"])\n input[\"text_character_count\"] = text_character_count\n character_counts.append(text_character_count)\n\n # Generate plot with the distribution of text character counts\n plt.figure(figsize=(10, 6))\n plt.hist(character_counts, bins=30, edgecolor=\"black\")\n plt.title(\"Distribution of Text Character Counts\")\n plt.xlabel(\"Character Count\")\n plt.ylabel(\"Frequency\")\n\n # Save the plot as an artifact of the step\n self.save_artifact(\n name=\"text_character_count_distribution\",\n write_function=lambda path: plt.savefig(path / \"figure.png\"),\n metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n )\n\n plt.close()\n\n yield inputs\n\n\nwith Pipeline() as pipeline:\n count_text_characters = CountTextCharacters()\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n dataset=load_dataset(\n \"HuggingFaceH4/instruction-dataset\", split=\"test\"\n ).rename_column(\"prompt\", \"text\"),\n )\n\n distiset.push_to_hub(\"distilabel-internal-testing/distilabel-artifacts-example\")\n The generated distilabel-internal-testing/distilabel-artifacts-example dataset repository has a section in its card describing the artifacts generated by the pipeline and the generated plot can be seen here. "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/","title":"Scaling and distributing a pipeline with Ray","text":"Although the local Pipeline based on multiprocessing + serving LLMs with an external service is enough for executing most of the pipelines used to create SFT and preference datasets, there are scenarios where you might need to scale your pipeline across multiple machines. In such cases, distilabel leverages Ray to distribute the workload efficiently. This allows you to generate larger datasets, reduce execution time, and maximize resource utilization across a cluster of machines, without needing to change a single line of code. "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#relation-between-distilabel-steps-and-ray-actors","title":"Relation between distilabel steps and Ray Actors","text":"A distilabel pipeline consist of several Step s. An Step is a class that defines a basic life-cycle: - It will load or create the resources (LLMs, clients, etc) required to run its logic.
- It will run a loop waiting for incoming batches, which are received using a queue. Once it receives a batch, it will process it and put the processed batch into an output queue.
- When it finishes a batch that is the final one or receives a special signal, the loop will finish and the unload logic will be executed (a schematic sketch of this life-cycle follows below).
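Putting the three points above together, a minimal, schematic sketch of this life-cycle could look like the following (the class and the queue-based plumbing are purely illustrative and hypothetical, not distilabel's actual internals):

import queue
from typing import Any, Dict, List, Optional


class StepReplicaSketch:
    """Toy illustration of the Step life-cycle described above."""

    def __init__(self, input_queue: queue.Queue, output_queue: queue.Queue) -> None:
        self.input_queue = input_queue
        self.output_queue = output_queue
        self.resources: Optional[Dict[str, Any]] = None

    def load(self) -> None:
        # Load or create the resources (LLMs, clients, etc.) required by the step
        self.resources = {"llm": "a loaded LLM client"}

    def process(self, batch: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # The actual logic of the step would go here
        return [{**row, "generation": "..."} for row in batch]

    def unload(self) -> None:
        # Cleanup executed once the loop has finished
        self.resources = None

    def run(self) -> None:
        self.load()
        while True:
            batch = self.input_queue.get()  # wait for incoming batches on the input queue
            if batch is None:  # final batch / special signal
                break
            self.output_queue.put(self.process(batch))  # put the processed batch in the output queue
        self.unload()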
So an Step needs to maintain a minimum state and the best way to do that with Ray is using actors. graph TD\n A[Step] -->|has| B[Multiple Replicas]\n B -->|wrapped in| C[Ray Actor]\n C -->|maintains| D[Step Replica State]\n C -->|executes| E[Step Lifecycle]\n E -->|1. Load/Create Resources| F[LLMs, Clients, etc.]\n E -->|2. Process batches from| G[Input Queue]\n E -->|3. Processed batches are put in| H[Output Queue]\n E -->|4. Unload| I[Cleanup]\n "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#executing-a-pipeline-with-ray","title":"Executing a pipeline with Ray","text":"The recommended way to execute a distilabel pipeline using Ray is using the Ray Jobs API. Before jumping on the explanation, let's first install the prerequisites: pip install distilabel[ray]\n Tip It's recommended to create a virtual environment. For the purpose of explaining how to execute a pipeline with Ray, we'll use the following pipeline throughout the examples: from distilabel.models import vLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(name=\"text-generation-ray-pipeline\") as pipeline:\n load_data_from_hub = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n text_generation = TextGeneration(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n )\n )\n\n load_data_from_hub >> text_generation\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n load_data_from_hub.name: {\n \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n \"split\": \"test\",\n },\n text_generation.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096,\n }\n },\n \"resources\": {\"replicas\": 2, \"gpus\": 1}, # (1)\n },\n }\n )\n\n distiset.push_to_hub(\n \"<YOUR_HF_USERNAME_OR_ORGANIZATION>/text-generation-distilabel-ray\" # (2)\n )\n - We're setting resources for the
text_generation step and defining that we want two replicas and one GPU per replica. distilabel will create two replicas of the step i.e. two actors in the Ray cluster, and each actor will request to be allocated in a node of the cluster that have at least one GPU. You can read more about how Ray manages the resources here. - You should modify this and add your user or organization on the Hugging Face Hub.
It's a basic pipeline with just two steps: one to load a dataset from the Hub with an instruction column and one to generate a response for that instruction using Llama 3 8B Instruct with vLLM. Simple but enough to demonstrate how to distribute and scale the workload using a Ray cluster! "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#using-ray-jobs-api","title":"Using Ray Jobs API","text":"If you don't know the Ray Jobs API then it's recommended to read Ray Jobs Overview. Quick summary: Ray Jobs is the recommended way to execute a job in a Ray cluster as it will handle packaging, deploying and managing the Ray application. To execute the pipeline above, we first need to create a directory (kind of a package) with the pipeline script (or scripts) that we will submit to the Ray cluster: mkdir ray-pipeline\n The content of the directory ray-pipeline should be: ray-pipeline/\n\u251c\u2500\u2500 pipeline.py\n\u2514\u2500\u2500 runtime_env.yaml\n The first file contains the code of the pipeline, while the second one (runtime_env.yaml ) is a specific Ray file containing the environment dependencies required to run the job: pip:\n - distilabel[ray,vllm] >= 1.3.0\nenv_vars:\n HF_TOKEN: <YOUR_HF_TOKEN>\n With this file we're basically informing to the Ray cluster that it will have to install distilabel with the vllm and ray extra dependencies to be able to run the job. In addition, we're defining the HF_TOKEN environment variable that will be used (by the push_to_hub method) to upload the resulting dataset to the Hugging Face Hub. After that, we can proceed to execute the ray command that will submit the job to the Ray cluster: ray job submit \\\n --address http://localhost:8265 \\\n --working-dir ray-pipeline \\\n --runtime-env ray-pipeline/runtime_env.yaml -- python pipeline.py\n What this will do, it's to basically upload the --working-dir to the Ray cluster, install the dependencies and then execute the python pipeline.py command from the head node. "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#file-system-requirements","title":"File system requirements","text":"As described in Using a file system to pass data to steps, distilabel relies on the file system to pass the data to the GlobalStep s, so if the pipeline to be executed in the Ray cluster have any GlobalStep or do you want to set the use_fs_to_pass_data=True of the run method, then you will need to setup a file system to which all the nodes of the Ray cluster have access: if __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={...},\n storage_parameters={\"path\": \"file:///mnt/data\"}, # (1)\n use_fs_to_pass_data=True,\n )\n - All the nodes of the Ray cluster should have access to
/mnt/data . "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#executing-a-raypipeline-in-a-cluster-with-slurm","title":"Executing a RayPipeline in a cluster with Slurm","text":"If you have access to an HPC, then you're probably also a user of Slurm, a workload manager typically used on HPCs. We can create Slurm job that takes some nodes and deploy a Ray cluster to run a distributed distilabel pipeline: #!/bin/bash\n#SBATCH --job-name=distilabel-ray-text-generation\n#SBATCH --partition=your-partition\n#SBATCH --qos=normal\n#SBATCH --nodes=2 # (1)\n#SBATCH --exclusive\n#SBATCH --ntasks-per-node=1 # (2)\n#SBATCH --gpus-per-node=1 # (3)\n#SBATCH --time=0:30:00\n\nset -ex\n\necho \"SLURM_JOB_ID: $SLURM_JOB_ID\"\necho \"SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST\"\n\n# Activate virtual environment\nsource /path/to/virtualenv/.venv/bin/activate\n\n# Getting the node names\nnodes=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\")\nnodes_array=($nodes)\n\n# Get the IP address of the head node\nhead_node=${nodes_array[0]}\nhead_node_ip=$(srun --nodes=1 --ntasks=1 -w \"$head_node\" hostname --ip-address)\n\n# Start Ray head node\nport=6379\nip_head=$head_node_ip:$port\nexport ip_head\necho \"IP Head: $ip_head\"\n\n# Generate a unique Ray tmp dir for the head node (just in case the default one is not writable)\nhead_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_head\"\n\necho \"Starting HEAD at $head_node\"\nOUTLINES_CACHE_DIR=\"/tmp/.outlines\" srun --nodes=1 --ntasks=1 -w \"$head_node\" \\ # (4)\n ray start --head --node-ip-address=\"$head_node_ip\" --port=$port \\\n --dashboard-host=0.0.0.0 \\\n --dashboard-port=8265 \\\n --temp-dir=\"$head_tmp_dir\" \\\n --block &\n\n# Give some time to head node to start...\necho \"Waiting a bit before starting worker nodes...\"\nsleep 10\n\n# Start Ray worker nodes\nworker_num=$((SLURM_JOB_NUM_NODES - 1))\n\n# Start from 1 (0 is head node)\nfor ((i = 1; i <= worker_num; i++)); do\n node_i=${nodes_array[$i]}\n worker_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_worker_$i\"\n echo \"Starting WORKER $i at $node_i\"\n OUTLINES_CACHE_DIR=\"/tmp/.outlines\" srun --nodes=1 --ntasks=1 -w \"$node_i\" \\\n ray start --address \"$ip_head\" \\\n --temp-dir=\"$worker_tmp_dir\" \\\n --block &\n sleep 5\ndone\n\n# Give some time to the Ray cluster to gather info\necho \"Waiting a bit before submitting the job...\"\nsleep 60\n\n# Finally submit the job to the cluster\nray job submit --address http://localhost:8265 --working-dir ray-pipeline -- python -u pipeline.py\n - In this case, we just want two nodes: one to run the Ray head node and one to run a worker.
- We just want to run a task per node i.e. the Ray command that starts the head/worker node.
- We have selected 1 GPU per node, but we could have selected more depending on the pipeline.
- We need to set the environment variable
OUTLINES_CACHE_DIR to /tmp/.outlines to avoid issues with the nodes trying to read/write the same outlines cache files, which is not possible. "},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#vllm-and-tensor_parallel_size","title":"vLLM and tensor_parallel_size ","text":"In order to use vLLM multi-GPU and multi-node capabilities with ray , we need to do a few changes in the example pipeline from above. The first change needed is to specify a value for tensor_parallel_size aka \"In how many GPUs do I want you to load the model\", and the second one is to define ray as the distributed_executor_backend as the default one in vLLM is to use multiprocessing : with Pipeline(name=\"text-generation-ray-pipeline\") as pipeline:\n load_data_from_hub = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n text_generation = TextGeneration(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n extra_kwargs={\n \"tensor_parallel_size\": 8,\n \"distributed_executor_backend\": \"ray\",\n }\n )\n )\n\n load_data_from_hub >> text_generation\n More information about distributed inference with vLLM can be found here: vLLM - Distributed Serving "},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/","title":"Serving an LLM for sharing it between several Task s","text":"It's very common to want to use the same LLM for several Task s in a pipeline. To avoid loading the LLM as many times as the number of Task s and avoid wasting resources, it's recommended to serve the model using solutions like text-generation-inference or vLLM , and then use an AsyncLLM compatible client like InferenceEndpointsLLM or OpenAILLM to communicate with the server respectively. "},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/#serving-llms-using-text-generation-inference","title":"Serving LLMs using text-generation-inference ","text":"model=meta-llama/Meta-Llama-3-8B-Instruct\nvolume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run\n\ndocker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \\\n -e HUGGING_FACE_HUB_TOKEN=<secret> \\\n ghcr.io/huggingface/text-generation-inference:2.0.4 \\\n --model-id $model\n Note The bash command above has been copy-pasted from the official docs text-generation-inference. Please refer to the official docs for more information. 
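Once the container is running, you can optionally sanity-check the deployment before wiring it into a pipeline. The snippet below is only a quick, illustrative check: it assumes the default TGI REST API (the /generate endpoint) and the localhost:8080 port mapping used in the command above, and the prompt text is arbitrary. # Quick sanity check of the TGI server started above (assumes the default /generate endpoint)\nimport json\nimport urllib.request\n\npayload = json.dumps(\n {\"inputs\": \"What is Deep Learning?\", \"parameters\": {\"max_new_tokens\": 20}}\n).encode()\nrequest = urllib.request.Request(\n \"http://localhost:8080/generate\",\n data=payload,\n headers={\"Content-Type\": \"application/json\"},\n)\nwith urllib.request.urlopen(request) as response:\n print(json.load(response)) # e.g. {'generated_text': '...'}\n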
And then we can use InferenceEndpointsLLM with base_url=http://localhost:8080 (pointing to our TGI local deployment): from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nwith Pipeline(name=\"serving-llm\") as pipeline:\n load_data = LoadDataFromDicts(\n data=[{\"instruction\": \"Write a poem about the sun and moon.\"}]\n )\n\n # `base_url` points to the address of the `TGI` serving the LLM\n llm = InferenceEndpointsLLM(base_url=\"http://192.168.1.138:8080\")\n\n text_generation = TextGeneration(\n llm=llm,\n num_generations=3,\n group_generations=True,\n output_mappings={\"generation\": \"generations\"},\n )\n\n ultrafeedback = UltraFeedback(aspect=\"overall-rating\", llm=llm)\n\n load_data >> text_generation >> ultrafeedback\n "},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/#serving-llms-using-vllm","title":"Serving LLMs using vLLM ","text":"docker run --gpus all \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n --env \"HUGGING_FACE_HUB_TOKEN=<secret>\" \\\n -p 8000:8000 \\\n --ipc=host \\\n vllm/vllm-openai:latest \\\n --model meta-llama/Meta-Llama-3-8B-Instruct\n Note The bash command above has been copy-pasted from the official docs vLLM. Please refer to the official docs for more information. And then we can use OpenAILLM with base_url=http://localhost:8000 (pointing to our vLLM local deployment): from distilabel.models import OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nwith Pipeline(name=\"serving-llm\") as pipeline:\n load_data = LoadDataFromDicts(\n data=[{\"instruction\": \"Write a poem about the sun and moon.\"}]\n )\n\n # `base_url` points to the address of the `vLLM` serving the LLM\n llm = OpenAILLM(base_url=\"http://192.168.1.138:8000\", model=\"\")\n\n text_generation = TextGeneration(\n llm=llm,\n num_generations=3,\n group_generations=True,\n output_mappings={\"generation\": \"generations\"},\n )\n\n ultrafeedback = UltraFeedback(aspect=\"overall-rating\", llm=llm)\n\n load_data >> text_generation >> ultrafeedback\n "},{"location":"sections/how_to_guides/advanced/structured_generation/","title":"Structured data generation","text":"Distilabel has integrations with relevant libraries to generate structured text i.e. to guide the LLM towards the generation of structured outputs following a JSON schema, a regex, etc. "},{"location":"sections/how_to_guides/advanced/structured_generation/#outlines","title":"Outlines","text":"Distilabel integrates outlines within some LLM subclasses. At the moment, the following LLMs integrated with outlines are supported in distilabel : TransformersLLM , vLLM or LlamaCppLLM , so that anyone can generate structured outputs in the form of JSON or a parseable regex. The LLM has an argument named structured_output 1 that determines how we can generate structured outputs with it, let's see an example using LlamaCppLLM . Note For outlines integration to work you may need to install the corresponding dependencies: pip install distilabel[outlines]\n "},{"location":"sections/how_to_guides/advanced/structured_generation/#json","title":"JSON","text":"We will start with a JSON example, where we initially define a pydantic.BaseModel schema to guide the generation of the structured output. 
Note Take a look at StructuredOutputType to see the expected format of the structured_output dict variable. from pydantic import BaseModel\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n And then we provide that schema to the structured_output argument of the LLM. from distilabel.models import LlamaCppLLM\n\nllm = LlamaCppLLM(\n model_path=\"./openhermes-2.5-mistral-7b.Q4_K_M.gguf\", # (1)\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"json\", \"schema\": User},\n)\nllm.load()\n - We have previously downloaded a GGUF model i.e.
llama.cpp compatible, from the Hugging Face Hub using curl2, but any model can be used as replacement, as long as the model_path argument is updated. And we are ready to pass our instruction as usual: import json\n\nresult = llm.generate(\n [[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]],\n max_new_tokens=50\n)\n\ndata = json.loads(result[0][0])\ndata\n# {'name': 'Kathy', 'last_name': 'Smith', 'id': 4539210}\nUser(**data)\n# User(name='Kathy', last_name='Smith', id=4539210)\n We get back a Python dictionary (formatted as a string) that we can parse using json.loads , or validate it directly using the User , which si a pydantic.BaseModel instance. "},{"location":"sections/how_to_guides/advanced/structured_generation/#regex","title":"Regex","text":"The following example shows an example of text generation whose output adhere to a regular expression: pattern = r\"<name>(.*?)</name>.*?<grade>(.*?)</grade>\" #\u00a0the same pattern for re.compile\n\nllm=LlamaCppLLM(\n model_path=model_path,\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"regex\", \"schema\": pattern},\n)\nllm.load()\n\nresult = llm.generate(\n [\n [\n {\"role\": \"system\", \"content\": \"You are Simpsons' fans who loves assigning grades from A to E, where A is the best and E is the worst.\"},\n {\"role\": \"user\", \"content\": \"What's up with Homer Simpson?\"}\n ]\n ],\n max_new_tokens=200\n)\n We can check the output by parsing the content using the same pattern we required from the LLM. import re\nmatch = re.search(pattern, result[0][0])\n\nif match:\n name = match.group(1)\n grade = match.group(2)\n print(f\"Name: {name}, Grade: {grade}\")\n# Name: Homer Simpson, Grade: C+\n These were some simple examples, but one can see the options this opens. Tip A full pipeline example can be seen in the following script: examples/structured_generation_with_outlines.py "},{"location":"sections/how_to_guides/advanced/structured_generation/#instructor","title":"Instructor","text":"For other LLM providers behind APIs, there's no direct way of accessing the internal logit processor like outlines does, but thanks to instructor we can generate structured output from LLM providers based on pydantic.BaseModel objects. We have integrated instructor to deal with the AsyncLLM . Note For instructor integration to work you may need to install the corresponding dependencies: pip install distilabel[instructor]\n Note Take a look at InstructorStructuredOutputType to see the expected format of the structured_output dict variable. The following is the same example you can see with outlines 's JSON section for comparison purposes. from pydantic import BaseModel\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n And then we provide that schema to the structured_output argument of the LLM: Note In this example we are using Meta Llama 3.1 8B Instruct, keep in mind not all the models support structured outputs. 
from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n structured_output={\"schema\": User}\n)\nllm.load()\n And we are ready to pass our instructions as usual: import json\n\nresult = llm.generate(\n [[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]],\n max_new_tokens=256\n)\n\ndata = json.loads(result[0][0])\ndata\n# {'name': 'John', 'last_name': 'Doe', 'id': 12345}\nUser(**data)\n# User(name='John', last_name='Doe', id=12345)\n We get back a Python dictionary (formatted as a string) that we can parse using json.loads , or validate it directly using the User , which is a pydantic.BaseModel instance. Tip A full pipeline example can be seen in the following script: examples/structured_generation_with_instructor.py "},{"location":"sections/how_to_guides/advanced/structured_generation/#openai-json","title":"OpenAI JSON","text":"OpenAI offers a JSON Mode to deal with structured output via their API, let's see how to make use of it. The JSON mode instructs the model to always return a JSON object following the provided instructions. Warning Bear in mind, for this to work, you must instruct the model in some way to generate JSON, either in the system message or in the instruction, as can be seen in the API reference. Contrary to what we have via outlines , JSON mode will not guarantee the output matches any specific schema, only that it is valid and parses without errors. More information can be found in the OpenAI documentation. Other than the reference to generating JSON, to ensure the model generates parseable JSON we can pass the argument response_format=\"json\" 3: from distilabel.models import OpenAILLM\nllm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\nllm.generate(..., response_format=\"json\")\n -
You can check the variable type by importing it from: from distilabel.steps.tasks.structured_outputs.outlines import StructuredOutputType\n \u21a9 -
Download the model with curl: curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n \u21a9 -
Keep in mind that to interact with this response_format argument in a pipeline, you will have to pass it via the generation_kwargs : # Assuming a pipeline is already defined, and we have a task using OpenAILLM called `task_with_openai`:\npipeline.run(\n parameters={\n \"task_with_openai\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"response_format\": \"json\"\n }\n }\n }\n }\n)\n \u21a9 "},{"location":"sections/how_to_guides/advanced/cli/","title":"Command Line Interface (CLI)","text":"Distilabel offers a CLI to explore and re-run existing Pipeline dumps, meaning that an existing dump can be explored to see the steps, how those are connected, the runtime parameters used, and also re-run it with the same or different runtime parameters, respectively. "},{"location":"sections/how_to_guides/advanced/cli/#available-commands","title":"Available commands","text":"The only available command as of the current version of distilabel is distilabel pipeline . $ distilabel pipeline --help\n\n Usage: distilabel pipeline [OPTIONS] COMMAND [ARGS]...\n\n Commands to run and inspect Distilabel pipelines.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 --help Show this message and exit. \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n\u256d\u2500 Commands \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 info Get information about a Distilabel pipeline. \u2502\n\u2502 run Run a Distilabel pipeline. \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n So on, distilabel pipeline has two subcommands: info and run , as described below. Note that for testing purposes we will be using the following dataset. 
"},{"location":"sections/how_to_guides/advanced/cli/#distilabel-pipeline-info","title":"distilabel pipeline info ","text":"$ distilabel pipeline info --help\n\n Usage: distilabel pipeline info [OPTIONS]\n\n Get information about a Distilabel pipeline.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 * --config TEXT Path or URL to the Distilabel pipeline configuration file. \u2502\n\u2502 [default: None] \u2502\n\u2502 [required] \u2502\n\u2502 --help Show this message and exit. \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n As we can see from the help message, we need to pass either a Path or a URL . This second option comes handy for datasets stored in Hugging Face Hub, for example: distilabel pipeline info --config \"https://huggingface.co/datasets/distilabel-internal-testing/instruction-dataset-mini-with-generations/raw/main/pipeline.yaml\"\n If we take a look: The pipeline information includes the steps used in the Pipeline along with the Runtime Parameter that was used, as well as a description of each of them, and also the connections between these steps. These can be helpful to explore the Pipeline locally. "},{"location":"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run","title":"distilabel pipeline run ","text":"We can also run a Pipeline from the CLI just pointing to the same pipeline.yaml file or an URL pointing to it and calling distilabel pipeline run . Alternatively, an URL pointing to a Python script containing a distilabel pipeline can be used: $ distilabel pipeline run --help\n\n Usage: distilabel pipeline run [OPTIONS]\n\n Run a Distilabel pipeline.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 --param PARSE_RUNTIME_PARAM [default: (dynamic)] \u2502\n\u2502 --config TEXT Path or URL to the Distilabel pipeline configuration file. 
\u2502\n\u2502 [default: None] \u2502\n\u2502 --script TEXT URL pointing to a python script containing a distilabel \u2502\n\u2502 pipeline. \u2502\n\u2502 [default: None] \u2502\n\u2502 --pipeline-variable-name TEXT Name of the pipeline in a script. I.e. the 'pipeline' \u2502\n\u2502 variable in `with Pipeline(...) as pipeline:...`. \u2502\n\u2502 [default: pipeline] \u2502\n\u2502 --ignore-cache --no-ignore-cache Whether to ignore the cache and re-run the pipeline from \u2502\n\u2502 scratch. \u2502\n\u2502 [default: no-ignore-cache] \u2502\n\u2502 --repo-id TEXT The Hugging Face Hub repository ID to push the resulting \u2502\n\u2502 dataset to. \u2502\n\u2502 [default: None] \u2502\n\u2502 --commit-message TEXT The commit message to use when pushing the dataset. \u2502\n\u2502 [default: None] \u2502\n\u2502 --private --no-private Whether to make the resulting dataset private on the Hub. \u2502\n\u2502 [default: no-private] \u2502\n\u2502 --token TEXT The Hugging Face Hub API token to use when pushing the \u2502\n\u2502 dataset. \u2502\n\u2502 [default: None] \u2502\n\u2502 --help Show this message and exit. \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n Using --config option, we must pass a path with a pipeline.yaml file. To specify the runtime parameters of the steps we will need to use the --param option and the value of the parameter in the following format: distilabel pipeline run --config \"https://huggingface.co/datasets/distilabel-internal-testing/instruction-dataset-mini-with-generations/raw/main/pipeline.yaml\" \\\n --param load_dataset.repo_id=distilabel-internal-testing/instruction-dataset-mini \\\n --param load_dataset.split=test \\\n --param generate_with_gpt35.llm.generation_kwargs.max_new_tokens=512 \\\n --param generate_with_gpt35.llm.generation_kwargs.temperature=0.7 \\\n --param to_argilla.dataset_name=text_generation_with_gpt35 \\\n --param to_argilla.dataset_workspace=admin\n Or using --script we can pass directly a remote python script (keep in mind --config and --script are exclusive): distilabel pipeline run --script \"https://huggingface.co/datasets/distilabel-internal-testing/pipe_nothing_test/raw/main/pipe_nothing.py\"\n You can also pass runtime parameters to the python script as we saw with --config option. Again, this helps with the reproducibility of the results, and simplifies sharing not only the final dataset but also the process to generate it. "},{"location":"sections/how_to_guides/basic/llm/","title":"Executing Tasks with LLMs","text":""},{"location":"sections/how_to_guides/basic/llm/#working-with-llms","title":"Working with LLMs","text":"LLM subclasses are designed to be used within a Task, but they can also be used standalone. 
from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n)\nllm.load()\n\nllm.generate_outputs(\n inputs=[\n [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n ],\n)\n# [\n# {\n# \"generations\": [\n# \"The capital of Spain is Madrid.\"\n# ],\n# \"statistics\": {\n# \"input_tokens\": [\n# 43\n# ],\n# \"output_tokens\": [\n# 8\n# ]\n# }\n# }\n# ]\n Note Always call the LLM.load or Task.load method when using LLMs standalone or as part of a Task . If using a Pipeline , this is done automatically in Pipeline.run() . New in version 1.5.0 Since version 1.5.0 the LLM output is a list of dictionaries (one per item in the inputs ), each containing generations , that reports the text returned by the LLM , and a statistics field that will store statistics related to the LLM generation. Initially, this will include input_tokens and output_tokens when available, which will be obtained via the API when available, or if a tokenizer is available for the model used, using the tokenizer for the model. This data will be moved by the corresponding Task during the pipeline processing and moved to distilabel_metadata so we can operate on this data if we want, like for example computing the number of tokens per dataset. To access to the previous result one just has to access to the generations in the resulting dictionary: result[0][\"generations\"] . "},{"location":"sections/how_to_guides/basic/llm/#offline-batch-generation","title":"Offline Batch Generation","text":"By default, all LLM s will generate text in a synchronous manner i.e. send inputs using generate_outputs method that will get blocked until outputs are generated. There are some LLM s (such as OpenAILLM) that implements what we denote as offline batch generation, which allows to send the inputs to the LLM-as-a-service which will generate the outputs asynchronously and give us a job id that we can use later to check the status and retrieve the generated outputs when they are ready. LLM-as-a-service platforms offers this feature as a way to save costs in exchange of waiting for the outputs to be generated. To use this feature in distilabel the only thing we need to do is to set the use_offline_batch_generation attribute to True when creating the LLM instance: from distilabel.models import OpenAILLM\n\nllm = OpenAILLM(\n model=\"gpt-4o\",\n use_offline_batch_generation=True,\n)\n\nllm.load()\n\nllm.jobs_ids # (1)\n# None\n\nllm.generate_outputs( # (2)\n inputs=[\n [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n ],\n)\n# DistilabelOfflineBatchGenerationNotFinishedException: Batch generation with jobs_ids=('batch_OGB4VjKpu2ay9nz3iiFJxt5H',) is not finished\n\nllm.jobs_ids # (3)\n# ('batch_OGB4VjKpu2ay9nz3iiFJxt5H',)\n\n\nllm.generate_outputs( # (4)\n inputs=[\n [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n ],\n)\n# [{'generations': ['The capital of Spain is Madrid.'],\n# 'statistics': {'input_tokens': [13], 'output_tokens': [7]}}]\n - At first the
jobs_ids attribute is None . - The first call to
generate_outputs will send the inputs to the LLM-as-a-service and raise a DistilabelOfflineBatchGenerationNotFinishedException since the outputs are not ready yet. - After the first call to
generate_outputs the jobs_ids attribute will contain the job ids created for generating the outputs. - The second call or subsequent calls to
generate_outputs will return the outputs if they are ready or raise a DistilabelOfflineBatchGenerationNotFinishedException if they are not ready yet. The offline_batch_generation_block_until_done attribute can be used to block the generate_outputs method until the outputs are ready polling the platform the specified amount of seconds. from distilabel.models import OpenAILLM\n\nllm = OpenAILLM(\n model=\"gpt-4o\",\n use_offline_batch_generation=True,\n offline_batch_generation_block_until_done=5, # poll for results every 5 seconds\n)\nllm.load()\n\nllm.generate_outputs(\n inputs=[\n [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n ],\n)\n# [{'generations': ['The capital of Spain is Madrid.'],\n# 'statistics': {'input_tokens': [13], 'output_tokens': [7]}}]\n "},{"location":"sections/how_to_guides/basic/llm/#within-a-task","title":"Within a Task","text":"Pass the LLM as an argument to the Task , and the task will handle the rest. from distilabel.models import OpenAILLM\nfrom distilabel.steps.tasks import TextGeneration\n\nllm = OpenAILLM(model=\"gpt-4o-mini\")\ntask = TextGeneration(name=\"text_generation\", llm=llm)\n\ntask.load()\n\nnext(task.process(inputs=[{\"instruction\": \"What's the capital of Spain?\"}]))\n# [{'instruction': \"What's the capital of Spain?\",\n# 'generation': 'The capital of Spain is Madrid.',\n# 'distilabel_metadata': {'raw_output_text_generation': 'The capital of Spain is Madrid.',\n# 'raw_input_text_generation': [{'role': 'user',\n# 'content': \"What's the capital of Spain?\"}],\n# 'statistics_text_generation': {'input_tokens': 13, 'output_tokens': 7}},\n# 'model_name': 'gpt-4o-mini'}]\n Note As mentioned in Working with LLMs section, the generation of an LLM is automatically moved to distilabel_metadata to avoid interference with the common workflow, so the addition of the statistics it's an extra component available for the user, but nothing has to be changed in the defined pipelines. "},{"location":"sections/how_to_guides/basic/llm/#runtime-parameters","title":"Runtime Parameters","text":"LLMs can have runtime parameters, such as generation_kwargs , provided via the Pipeline.run() method using the params argument. Note Runtime parameters can differ between LLM subclasses, caused by the different functionalities offered by the LLM providers. from distilabel.pipeline import Pipeline\nfrom distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(name=\"text-generation-pipeline\") as pipeline:\n load_dataset = LoadDataFromDicts(\n name=\"load_dataset\",\n data=[{\"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\"}],\n )\n\n text_generation = TextGeneration(\n name=\"text_generation\",\n llm=OpenAILLM(model=\"gpt-4o-mini\"),\n )\n\n load_dataset >> text_generation\n\nif __name__ == \"__main__\":\n pipeline.run(\n parameters={\n text_generation.name: {\"llm\": {\"generation_kwargs\": {\"temperature\": 0.3}}},\n },\n )\n "},{"location":"sections/how_to_guides/basic/llm/#creating-custom-llms","title":"Creating custom LLMs","text":"To create custom LLMs, subclass either LLM for synchronous or AsyncLLM for asynchronous LLMs. Implement the following methods: -
model_name : A property containing the model's name. -
generate : A method that takes a list of prompts and returns generated texts. -
agenerate : A method that takes a single prompt and returns generated texts. This method is used within the generate method of the AsyncLLM class. -
(optional) get_last_hidden_state : is a method that will take a list of prompts and return a list of hidden states. This method is optional and will be used by some tasks such as the GenerateEmbeddings task. Custom LLMCustom AsyncLLM from typing import Any\n\nfrom pydantic import validate_call\n\nfrom distilabel.models import LLM\nfrom distilabel.typing import GenerateOutput, HiddenState\nfrom distilabel.typing import ChatType\n\nclass CustomLLM(LLM):\n @property\n def model_name(self) -> str:\n return \"my-model\"\n\n @validate_call\n def generate(self, inputs: List[ChatType], num_generations: int = 1, **kwargs: Any) -> List[GenerateOutput]:\n for _ in range(num_generations):\n ...\n\n def get_last_hidden_state(self, inputs: List[ChatType]) -> List[HiddenState]:\n ...\n from typing import Any\n\nfrom pydantic import validate_call\n\nfrom distilabel.models import AsyncLLM\nfrom distilabel.typing import GenerateOutput, HiddenState\nfrom distilabel.typing import ChatType\n\nclass CustomAsyncLLM(AsyncLLM):\n @property\n def model_name(self) -> str:\n return \"my-model\"\n\n @validate_call\n async def agenerate(self, input: ChatType, num_generations: int = 1, **kwargs: Any) -> GenerateOutput:\n for _ in range(num_generations):\n ...\n\n def get_last_hidden_state(self, inputs: List[ChatType]) -> List[HiddenState]:\n ...\n generate and agenerate keyword arguments (but input and num_generations ) are considered as RuntimeParameter s, so a value can be passed to them via the parameters argument of the Pipeline.run method. Note To have the arguments of the generate and agenerate coerced to the expected types, the validate_call decorator is used, which will automatically coerce the arguments to the expected types, and raise an error if the types are not correct. This is specially useful when providing a value for an argument of generate or agenerate from the CLI, since the CLI will always provide the arguments as strings. Warning Additional LLMs created in distilabel will have to take into account how the statistics are generated to properly include them in the LLM output. "},{"location":"sections/how_to_guides/basic/llm/#available-llms","title":"Available LLMs","text":"Our LLM gallery shows a list of the available LLMs that can be used within the distilabel library. "},{"location":"sections/how_to_guides/basic/pipeline/","title":"Execute Steps and Tasks in a Pipeline","text":""},{"location":"sections/how_to_guides/basic/pipeline/#how-to-create-a-pipeline","title":"How to create a pipeline","text":"Pipeline organise the Steps and Tasks in a sequence, where the output of one step is the input of the next one. A Pipeline should be created by making use of the context manager along with passing a name, and optionally a description. from distilabel.pipeline import Pipeline\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n ...\n "},{"location":"sections/how_to_guides/basic/pipeline/#connecting-steps-with-the-stepconnect-method","title":"Connecting steps with the Step.connect method","text":"Now, we can define the steps of our Pipeline . Note Steps without predecessors (i.e. root steps), need to be GeneratorStep s such as LoadDataFromDicts or LoadDataFromHub . After this, other steps can be defined. 
from distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(name=\"load_dataset\")\n ...\n Easily load your datasets If you are already used to work with Hugging Face's Dataset via load_dataset or pd.DataFrame , you can create the GeneratorStep directly from the dataset (or dataframe), and create the step with the help of make_generator_step : From a list of dictsFrom datasets.Dataset From pd.DataFrame from distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = [{\"instruction\": \"Tell me a joke.\"}]\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n ...\n from datasets import load_dataset\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = load_dataset(\n \"DIBT/10k_prompts_ranked\",\n split=\"train\"\n).filter(\n lambda r: r[\"avg_rating\"]>=4 and r[\"num_responses\"]>=2\n).select(range(500))\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n ...\n import pandas as pd\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = pd.read_csv(\"path/to/dataset.csv\")\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n ...\n Next, we will use prompt column from the dataset obtained through LoadDataFromHub and use several LLM s to execute a TextGeneration task. We will also use the Task.connect() method to connect the steps, so the output of one step is the input of the next one. Note The order of the execution of the steps will be determined by the connections of the steps. In this case, the TextGeneration tasks will be executed after the LoadDataFromHub step. from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n task.connect(load_dataset)\n\n ...\n For each row of the dataset, the TextGeneration task will generate a text based on the instruction column and the LLM model, and store the result (a single string) in a new column called generation . Because we need to have the response s in the same column, we will add GroupColumns to combine them all in the same column as a list of strings. Note In this case, the GroupColumns tasks will be executed after all TextGeneration steps. 
from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n load_dataset.connect(task)\n task.connect(combine_generations)\n "},{"location":"sections/how_to_guides/basic/pipeline/#connecting-steps-with-the-operator","title":"Connecting steps with the >> operator","text":"Besides the Step.connect method: step1.connect(step2) , there's an alternative way by making use of the >> operator. We can connect steps in a more readable way, and it's also possible to connect multiple steps at once. Step per stepMultiple steps at once Each call to step1.connect(step2) has been exchanged by step1 >> step2 within the loop. from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n load_dataset >> task >> combine_generations\n Each task is first appended to a list, and then all the calls to connections are done in a single call. from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n tasks = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n tasks.append(\n TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n )\n\n load_dataset >> tasks >> combine_generations\n "},{"location":"sections/how_to_guides/basic/pipeline/#routing-batches-to-specific-downstream-steps","title":"Routing batches to specific downstream steps","text":"In some pipelines, you may want to send batches from a single upstream step to specific downstream steps based on certain conditions. To achieve this, you can use a routing_batch_function . This function takes a list of downstream steps and returns a list of step names to which each batch should be routed. 
Let's update the example above to route the batches loaded by the LoadDataFromHub step to just 2 of the TextGeneration tasks. First, we will create our custom routing_batch_function , and then we will update the pipeline to use it: import random\nfrom distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\n@routing_batch_function\ndef sample_two_steps(steps: list[str]) -> list[str]:\n return random.sample(steps, 2)\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"},\n )\n\n tasks = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.0-pro\"),\n ):\n tasks.append(\n TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n )\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n load_dataset >> sample_two_steps >> tasks >> combine_generations\n The routing_batch_function that we just built is a common one, so distilabel comes with a builtin function that can be used to achieve the same behavior: from distilable.pipeline import sample_n_steps\n\nsample_two_steps = sample_n_steps(2)\n "},{"location":"sections/how_to_guides/basic/pipeline/#running-the-pipeline","title":"Running the pipeline","text":""},{"location":"sections/how_to_guides/basic/pipeline/#pipelinedry_run","title":"Pipeline.dry_run","text":"Before running the Pipeline we can check if the pipeline is valid using the Pipeline.dry_run() method. It takes the same parameters as the run method which we will discuss in the following section, plus the batch_size we want the dry run to use (by default set to 1). with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n ...\n\nif __name__ == \"__main__\":\n distiset = pipeline.dry_run(parameters=..., batch_size=1)\n "},{"location":"sections/how_to_guides/basic/pipeline/#pipelinerun","title":"Pipeline.run","text":"After testing, we can now execute the full Pipeline using the Pipeline.run() method. with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n ...\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n \"load_dataset\": {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n \"text_generation_with_gpt-4-0125-preview\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n \"text_generation_with_mistral-large-2402\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n \"text_generation_with_gemini-1.0-pro\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n },\n )\n But if we run the pipeline above, we will see that the run method will fail: ValueError: Step 'text_generation_with_gpt-4-0125-preview' requires inputs ['instruction'], but only the inputs=['prompt', 'completion', 'meta'] are available, which means that the inputs=['instruction'] are missing or not available\nwhen the step gets to be executed in the pipeline. 
Please make sure previous steps to 'text_generation_with_gpt-4-0125-preview' are generating the required inputs.\n This is because, before actually running the pipeline, we must ensure each step has the necessary input columns to be executed. In this case, the TextGeneration task requires the instruction column, but the LoadDataFromHub step generates the prompt column. To solve this, we can use the output_mappings or input_mapping arguments of individual Step s, to map columns from one step to another. with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"}\n )\n\n ...\n If we execute the pipeline again, it will run successfully and we will have a Distiset with the outputs of all the leaf steps of the pipeline which we can push to the Hugging Face Hub. if __name__ == \"__main__\":\n distiset = pipeline.run(...)\n distiset.push_to_hub(\"distilabel-internal-testing/instruction-dataset-mini-with-generations\")\n "},{"location":"sections/how_to_guides/basic/pipeline/#pipelinerun-with-a-dataset","title":"Pipeline.run with a dataset","text":"Note that in most cases if you don't need the extra flexibility the GeneratorSteps bring you, you can create a dataset as you would normally do and pass it to the Pipeline.run method directly. Look at the highlighted lines to see the updated lines: import random\nfrom distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import GroupColumns\nfrom distilabel.steps.tasks import TextGeneration\n\n@routing_batch_function\ndef sample_two_steps(steps: list[str]) -> list[str]:\n return random.sample(steps, 2)\n\ndataset = load_dataset(\n \"distilabel-internal-testing/instruction-dataset-mini\",\n split=\"test\"\n)\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n tasks = []\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.0-pro\"),\n ):\n tasks.append(\n TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n )\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n sample_two_steps >> tasks >> combine_generations\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n dataset=dataset,\n parameters=...\n )\n "},{"location":"sections/how_to_guides/basic/pipeline/#stopping-the-pipeline","title":"Stopping the pipeline","text":"In case you want to stop the pipeline while it's running, you can press Ctrl+C or Cmd+C depending on your OS (or send a SIGINT to the main process), and the outputs will be stored in the cache. Pressing an additional time will force the pipeline to stop its execution, but this can lead to losing the generated outputs for certain batches. "},{"location":"sections/how_to_guides/basic/pipeline/#cache","title":"Cache","text":"If for some reason, the pipeline execution stops (for example by pressing Ctrl+C ), the state of the pipeline and the outputs will be stored in the cache, so we can resume the pipeline execution from the point where it was stopped. 
If we want to force the pipeline to run again without can, then we can use the use_cache argument of the Pipeline.run() method: if __name__ == \"__main__\":\n distiset = pipeline.run(parameters={...}, use_cache=False)\n Note For more information on caching, we refer the reader to the caching section. "},{"location":"sections/how_to_guides/basic/pipeline/#adjusting-the-batch-size-for-each-step","title":"Adjusting the batch size for each step","text":"Memory issues can arise when processing large datasets or when using large models. To avoid this, we can use the input_batch_size argument of individual tasks. TextGeneration task will receive 5 dictionaries, while the LoadDataFromHub step will send 10 dictionaries per batch: from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"},\n batch_size=10\n )\n\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.5-pro\"),\n ):\n task = TextGeneration(\n name=f\"text_generation_with_{llm.model_name.replace('.', '-')}\",\n llm=llm,\n input_batch_size=5,\n )\n\n ...\n "},{"location":"sections/how_to_guides/basic/pipeline/#serializing-the-pipeline","title":"Serializing the pipeline","text":"Sharing a pipeline with others is very easy, as we can serialize the pipeline object using the save method. We can save the pipeline in different formats, such as yaml or json : yamljson if __name__ == \"__main__\":\n pipeline.save(\"pipeline.yaml\", format=\"yaml\")\n if __name__ == \"__main__\":\n pipeline.save(\"pipeline.json\", format=\"json\")\n To load the pipeline, we can use the from_yaml or from_json methods: yamljson pipeline = Pipeline.from_yaml(\"pipeline.yaml\")\n pipeline = Pipeline.from_json(\"pipeline.json\")\n Serializing the pipeline is very useful when we want to share the pipeline with others, or when we want to store the pipeline for future use. It can even be hosted online, so the pipeline can be executed directly using the CLI. "},{"location":"sections/how_to_guides/basic/pipeline/#visualizing-the-pipeline","title":"Visualizing the pipeline","text":"We can visualize the pipeline using the Pipeline.draw() method. This will create a mermaid graph, and return the path to the image. path_to_image = pipeline.draw(\n top_to_bottom=True,\n show_edge_labels=True,\n)\n Within notebooks, we can simply call pipeline and the graph will be displayed. Alternatively, we can use the Pipeline.draw() method to have more control over the graph visualization and use IPython to display it. from IPython.display import Image, display\n\ndisplay(Image(path_to_image))\n Let's now see how the pipeline of the fully working example looks like. "},{"location":"sections/how_to_guides/basic/pipeline/#fully-working-example","title":"Fully working example","text":"To sum up, here is the full code of the pipeline we have created in this section. 
Note that you will need to change the name of the Hugging Face repository where the resulting will be pushed, set OPENAI_API_KEY environment variable, set MISTRAL_API_KEY and have gcloud installed and configured: Code from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"},\n )\n\n combine_generations = GroupColumns(\n name=\"combine_generations\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n for llm in (\n OpenAILLM(model=\"gpt-4-0125-preview\"),\n MistralLLM(model=\"mistral-large-2402\"),\n VertexAILLM(model=\"gemini-1.0-pro\"),\n ):\n task = TextGeneration(\n name=f\"text_generation_with_{llm.model_name.replace('.', '-')}\", llm=llm\n )\n load_dataset.connect(task)\n task.connect(combine_generations)\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n \"load_dataset\": {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n \"text_generation_with_gpt-4-0125-preview\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n \"text_generation_with_mistral-large-2402\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n \"text_generation_with_gemini-1.0-pro\": {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n },\n },\n )\n distiset.push_to_hub(\n \"distilabel-internal-testing/instruction-dataset-mini-with-generations\"\n )\n "},{"location":"sections/how_to_guides/basic/step/","title":"Steps for processing data","text":""},{"location":"sections/how_to_guides/basic/step/#working-with-steps","title":"Working with Steps","text":"The Step is intended to be used within the scope of a Pipeline , which will orchestrate the different steps defined but can also be used standalone. Assuming that we have a Step already defined as it follows: from typing import TYPE_CHECKING\nfrom distilabel.steps import Step, StepInput\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns, StepOutput\n\nclass MyStep(Step):\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"input_field\"]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"output_field\"]\n\n def process(self, inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n input[\"output_field\"] = input[\"input_field\"]\n yield inputs\n Then we can use it as follows: step = MyStep(name=\"my-step\")\nstep.load()\n\nnext(step.process([{\"input_field\": \"value\"}]))\n# [{'input_field': 'value', 'output_field': 'value'}]\n Note The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution. "},{"location":"sections/how_to_guides/basic/step/#arguments","title":"Arguments","text":" -
input_mappings , is a dictionary that maps keys from the input dictionaries to the keys expected by the step. For example, input_mappings={\"instruction\": \"prompt\"} means that the input key prompt will be used as the key instruction for the current step. -
output_mappings , is a dictionary that can be used to map the outputs of the step to other names. For example, output_mappings={\"conversation\": \"prompt\"} means that the output key conversation will be renamed to prompt for the next step. -
input_batch_size (by default set to 50), is set independently for every step and determines how many input dictionaries will be processed at once. A combined example of these arguments is sketched below, after the Runtime parameters section. "},{"location":"sections/how_to_guides/basic/step/#runtime-parameters","title":"Runtime parameters","text":"Step s can also have RuntimeParameter , which are parameters that can only be used after the pipeline initialisation when calling the Pipeline.run . from distilabel.mixins.runtime_parameters import RuntimeParameter\n\nclass Step(...):\n    input_batch_size: RuntimeParameter[PositiveInt] = Field(\n        default=DEFAULT_INPUT_BATCH_SIZE,\n        description=\"The number of rows that will contain the batches processed by the\"\n        \" step.\",\n    )\n 
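For illustration, here is a minimal sketch that combines the arguments described above, reusing the MyStep class defined earlier; the column names, data, and batch sizes are arbitrary choices for this example rather than values required by distilabel: from distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline(name=\"step-arguments-demo\") as pipeline:\n    # The generator step produces a \"prompt\" column in batches of 10 rows.\n    load_data = LoadDataFromDicts(\n        name=\"load_data\",\n        data=[{\"prompt\": \"Write a haiku about data.\"}] * 20,\n        batch_size=10,\n    )\n\n    # Following the convention described above: `input_mappings` tells the step to\n    # read its `input_field` from the incoming `prompt` column, `output_mappings`\n    # renames its `output_field` output to `response` for the next steps, and\n    # `input_batch_size` makes the step process 5 rows at a time.\n    my_step = MyStep(\n        name=\"my-step\",\n        input_mappings={\"input_field\": \"prompt\"},\n        output_mappings={\"output_field\": \"response\"},\n        input_batch_size=5,\n    )\n\n    load_data >> my_step\n Since input_batch_size is declared as a RuntimeParameter , it could also be overridden when running the pipeline, e.g. pipeline.run(parameters={\"my-step\": {\"input_batch_size\": 10}}) , without modifying the step definition.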
"},{"location":"sections/how_to_guides/basic/step/#types-of-steps","title":"Types of Steps","text":"There are two special types of Step in distilabel : -
GeneratorStep : is a step that only generates data, it doesn't need any input data from previous steps, and it is normally the first node in a Pipeline . More information: Components -> Step - GeneratorStep. -
GlobalStep : is a step with the standard interface i.e. it receives inputs and generates outputs, but it processes all the data at once, and it is often the final step in the Pipeline . Note that a GlobalStep requires all the previous steps to finish before being able to start. More information: Components - Step - GlobalStep. -
Additionally, there is the Task , which is essentially the same as a default Step , but it relies on an LLM as an attribute, and the process method will be in charge of calling that LLM. More information: Components - Task. "},{"location":"sections/how_to_guides/basic/step/#defining-custom-steps","title":"Defining custom Steps","text":"We can define a custom step by creating a new subclass of the Step and defining the following: -
inputs : is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not. -
outputs : is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not. -
process : is a method that receives the input data and returns the output data, and it should be a generator, meaning that it should yield the output data. Note The default signature for the process method is process(self, *inputs: StepInput) -> StepOutput . The argument inputs should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one. Warning For the custom Step subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline , the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING , otherwise, the validation and/or serialization will fail. Inherit from Step Using the @step decorator We can inherit from the Step class and define the inputs , outputs , and process methods as follows: from typing import TYPE_CHECKING\nfrom distilabel.steps import Step, StepInput\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns, StepOutput\n\nclass CustomStep(Step):\n @property\n def inputs(self) -> \"StepColumns\":\n ...\n\n @property\n def outputs(self) -> \"StepColumns\":\n ...\n\n def process(self, *inputs: StepInput) -> \"StepOutput\":\n for upstream_step_inputs in inputs:\n ...\n yield item\n\n # When overridden (ideally under the `typing_extensions.override` decorator)\n # @typing_extensions.override\n # def process(self, inputs: StepInput) -> StepOutput:\n # for input in inputs:\n # ...\n # yield inputs\n The @step decorator will take care of the boilerplate code, and will allow to define the inputs , outputs , and process methods in a more straightforward way. One downside is that it won't let you access the self attributes if any, neither set those, so if you need to access or set any attribute, you should go with the first approach of defining the custom Step subclass. from typing import TYPE_CHECKING\nfrom distilabel.steps import StepInput, step\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepOutput\n\n@step(inputs=[...], outputs=[...])\ndef CustomStep(inputs: StepInput) -> \"StepOutput\":\n for input in inputs:\n ...\n yield inputs\n\nstep = CustomStep(name=\"my-step\")\n "},{"location":"sections/how_to_guides/basic/step/generator_step/","title":"GeneratorStep","text":"The GeneratorStep is a subclass of Step that is intended to be used as the first step within a Pipeline , because it doesn't require input and generates data that can be used by other steps. Alternatively, it can also be used as a standalone. 
from typing import List, TYPE_CHECKING\nfrom typing_extensions import override\n\nfrom distilabel.steps import GeneratorStep\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns, GeneratorStepOutput\n\nclass MyGeneratorStep(GeneratorStep):\n instructions: List[str]\n\n @override\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n if offset:\n self.instructions = self.instructions[offset:]\n\n while self.instructions:\n batch = [\n {\n \"instruction\": instruction\n } for instruction in self.instructions[: self.batch_size]\n ]\n self.instructions = self.instructions[self.batch_size :]\n yield (\n batch,\n True if len(self.instructions) == 0 else False,\n )\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"instruction\"]\n Then we can use it as follows: step = MyGeneratorStep(\n name=\"my-generator-step\",\n instructions=[\"Tell me a joke.\", \"Tell me a story.\"],\n batch_size=1,\n)\nstep.load()\n\nnext(step.process(offset=0))\n# ([{'instruction': 'Tell me a joke.'}], False)\nnext(step.process(offset=1))\n# ([{'instruction': 'Tell me a story.'}], True)\n Note The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution. "},{"location":"sections/how_to_guides/basic/step/generator_step/#defining-custom-generatorsteps","title":"Defining custom GeneratorSteps","text":"We can define a custom generator step by creating a new subclass of the GeneratorStep and defining the following: -
outputs : is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not. -
process : is a method that yields output data and a boolean flag indicating whether that's the last batch to be generated. Note The default signature for the process method is process(self, offset: int = 0) -> GeneratorStepOutput . The argument offset should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one. Warning For the custom Step subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline , the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING , otherwise, the validation and/or serialization will fail. Inherit from GeneratorStep Using the @step decorator We can inherit from the GeneratorStep class and define the outputs , and process methods as follows: from typing import List, TYPE_CHECKING\nfrom typing_extensions import override\n\nfrom distilabel.steps import GeneratorStep\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns, GeneratorStepOutput\n\nclass MyGeneratorStep(GeneratorStep):\n instructions: List[str]\n\n @override\n def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n ...\n\n @property\n def outputs(self) -> \"StepColumns\":\n ...\n The @step decorator will take care of the boilerplate code, and will allow to define the outputs , and process methods in a more straightforward way. One downside is that it won't let you access the self attributes if any, neither set those, so if you need to access or set any attribute, you should go with the first approach of defining the custom GeneratorStep subclass. from typing import TYPE_CHECKING\nfrom distilabel.steps import step\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import GeneratorStepOutput\n\n@step(outputs=[...], step_type=\"generator\")\ndef CustomGeneratorStep(offset: int = 0) -> \"GeneratorStepOutput\":\n yield (\n ...,\n True if offset == 10 else False,\n )\n\nstep = CustomGeneratorStep(name=\"my-step\")\n "},{"location":"sections/how_to_guides/basic/step/global_step/","title":"GlobalStep","text":"The GlobalStep is a subclass of Step that is used to define a step that requires the previous steps to be completed to run, since it will wait until all the input batches are received before running. This step is useful when you need to run a step that requires all the input data to be processed before running. Alternatively, it can also be used as a standalone. "},{"location":"sections/how_to_guides/basic/step/global_step/#defining-custom-globalsteps","title":"Defining custom GlobalSteps","text":"We can define a custom step by creating a new subclass of the GlobalStep and defining the following: -
inputs : is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not. -
outputs : is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not. -
process : is a method that receives the input data and returns the output data, and it should be a generator, meaning that it should yield the output data. Note The default signature for the process method is process(self, *inputs: StepInput) -> StepOutput . The argument inputs should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one. Warning For the custom GlobalStep subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline , the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING , otherwise, the validation and/or serialization will fail. Inherit from GlobalStep Using the @step decorator We can inherit from the GlobalStep class and define the inputs , outputs , and process methods as follows: from typing import TYPE_CHECKING\nfrom distilabel.steps import GlobalStep, StepInput\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, StepOutput\n\nclass CustomStep(GlobalStep):\n    @property\n    def inputs(self) -> \"StepColumns\":\n        ...\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        ...\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        for upstream_step_inputs in inputs:\n            for item in upstream_step_inputs:\n                ...\n            yield item\n\n    # When overridden (ideally under the `typing_extensions.override` decorator)\n    # @typing_extensions.override\n    # def process(self, inputs: StepInput) -> StepOutput:\n    #     for input in inputs:\n    #         ...\n    #     yield inputs\n The @step decorator will take care of the boilerplate code, and will allow us to define the inputs , outputs , and process methods in a more straightforward way. One downside is that it won't let you access the self attributes if any, nor set them, so if you need to access or set any attribute, you should go with the first approach of defining the custom GlobalStep subclass. from typing import TYPE_CHECKING\nfrom distilabel.steps import StepInput, step\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepOutput\n\n@step(inputs=[...], outputs=[...], step_type=\"global\")\ndef CustomStep(inputs: StepInput) -> \"StepOutput\":\n    for input in inputs:\n        ...\n    yield inputs\n\nstep = CustomStep(name=\"my-step\")\n "},{"location":"sections/how_to_guides/basic/task/","title":"Tasks for generating and judging with LLMs","text":""},{"location":"sections/how_to_guides/basic/task/#working-with-tasks","title":"Working with Tasks","text":"The Task is a special kind of Step that includes the LLM as a mandatory argument. As with a Step , it is normally used within a Pipeline but can also be used standalone. For example, the most basic task is the TextGeneration task, which generates text based on a given instruction. 
from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n name=\"text-generation\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n# {\n# \"instruction\": \"What's the capital of Spain?\",\n# \"generation\": \"The capital of Spain is Madrid.\",\n# \"distilabel_metadata\": {\n# \"raw_output_text-generation\": \"The capital of Spain is Madrid.\",\n# \"raw_input_text-generation\": [\n# {\n# \"role\": \"user\",\n# \"content\": \"What's the capital of Spain?\"\n# }\n# ],\n# \"statistics_text-generation\": { # (1)\n# \"input_tokens\": 18,\n# \"output_tokens\": 8\n# }\n# },\n# \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n# }\n# ]\n - The
LLMs will not only return the text but also a statistics_{STEP_NAME} field that will contain statistics related to the generation. If available, at least the input and output tokens will be returned. Note The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution. As shown above, the TextGeneration task adds a generation based on the instruction . New in version 1.2.0 Since version 1.2.0 , we provide some metadata about the LLM call through distilabel_metadata . This can be disabled by setting the add_raw_output attribute to False when creating the task. Additionally, since version 1.4.0 , the formatted input can also be included, which can be helpful when testing custom templates (testing the pipeline using the dry_run method). disable raw input and outputtask = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    add_raw_output=False,\n    add_raw_input=False\n)\n New in version 1.5.0 Since version 1.5.0 , distilabel_metadata includes a new statistics field out of the box. The generation from the LLM will not only contain the text, but also statistics associated with the text if available, like the input and output tokens. This field will be generated with statistics_{STEP_NAME} to avoid collisions between different steps in the pipeline, similar to how raw_output_{STEP_NAME} works. "},{"location":"sections/how_to_guides/basic/task/#taskprint","title":"Task.print","text":"New in version 1.4.0 Since version 1.4.0 , the Task.print method is available. The Tasks include a handy method to show what the prompt formatted for an LLM would look like. Let's see an example with UltraFeedback , but it applies to any other Task . from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\nuf = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n)\nuf.load()\nuf.print()\n The result will be a rendered prompt, with the System prompt (if the task defines one) and the User prompt, rendered with rich (it will show exactly the same in a jupyter notebook). In case you want to test with a custom input, you can pass an example to the task's format_input method (or generate it on your own depending on the task), and pass it to the print method so that it shows your example: uf.print(\n    uf.format_input({\"instruction\": \"test\", \"generations\": [\"1\", \"2\"]})\n)\n Using a DummyLLM to avoid loading one In case you don't want to load an LLM to render the template, you can create a dummy one like the ones we could use for testing. from typing import Any\n\nfrom distilabel.models import AsyncLLM\nfrom distilabel.models.mixins import MagpieChatTemplateMixin\n\nclass DummyLLM(AsyncLLM, MagpieChatTemplateMixin):\n    structured_output: Any = None\n    magpie_pre_query_template: str = \"llama3\"\n\n    def load(self) -> None:\n        pass\n\n    @property\n    def model_name(self) -> str:\n        return \"test\"\n\n    def generate(\n        self, input: \"FormattedInput\", num_generations: int = 1\n    ) -> \"GenerateOutput\":\n        return [\"output\" for _ in range(num_generations)]\n You can use this LLM just as any of the other ones to load your task and call print : uf = UltraFeedback(llm=DummyLLM())\nuf.load()\nuf.print()\n Note When creating a custom task, the print method will be available by default, but it is limited to the most common scenarios for the inputs. 
If you test your new task and find it's not working as expected (for example, if your task contains one input consisting of a list of texts instead of a single one), you should override the _sample_input method. You can inspect the UltraFeedback source code for this. "},{"location":"sections/how_to_guides/basic/task/#specifying-the-number-of-generations-and-grouping-generations","title":"Specifying the number of generations and grouping generations","text":"All the Task s have a num_generations attribute that allows defining the number of generations that we want to have per input. We can update the example above to generate 3 completions per input: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n name=\"text-generation\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n num_generations=3,\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n# {\n# 'instruction': \"What's the capital of Spain?\",\n# 'generation': 'The capital of Spain is Madrid.',\n# 'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n# 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n# },\n# {\n# 'instruction': \"What's the capital of Spain?\",\n# 'generation': 'The capital of Spain is Madrid.',\n# 'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n# 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n# },\n# {\n# 'instruction': \"What's the capital of Spain?\",\n# 'generation': 'The capital of Spain is Madrid.',\n# 'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n# 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n# }\n# ]\n In addition, we might want to group the generations in a single output row as maybe one downstream step expects a single row with multiple generations. We can achieve this by setting the group_generations attribute to True : from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n name=\"text-generation\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n num_generations=3,\n group_generations=True\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n# {\n# 'instruction': \"What's the capital of Spain?\",\n# 'generation': ['The capital of Spain is Madrid.', 'The capital of Spain is Madrid.', 'The capital of Spain is Madrid.'],\n# 'distilabel_metadata': [\n# {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n# {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n# {'raw_output_text-generation': 'The capital of Spain is Madrid.'}\n# ],\n# 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n# }\n# ]\n "},{"location":"sections/how_to_guides/basic/task/#defining-custom-tasks","title":"Defining custom Tasks","text":"We can define a custom step by creating a new subclass of the Task and defining the following: -
inputs : is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not. -
format_input : is a method that receives a dictionary with the input data and returns a ChatType following the chat-completion OpenAI message formatting. -
outputs : is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not. This property should always include model_name as one of the outputs since that's automatically injected from the LLM. -
format_output : is a method that receives the output from the LLM and optionally also the input data (which may be useful to build the output in some scenarios), and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs . Note that there's no need to include the model_name in the output. Inherit from Task Using the @task decorator When using the Task class inheritance method for creating a custom task, we can also optionally override the Task.process method to define a more complex processing logic involving an LLM , as the default one just calls the LLM.generate method once previously formatting the input and subsequently formatting the output. For example, EvolInstruct task overrides this method to call the LLM.generate multiple times (one for each evolution). from typing import Any, Dict, List, Union, TYPE_CHECKING\n\nfrom distilabel.steps.tasks import Task\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepColumns\n from distilabel.steps.tasks.typing import ChatType\n\n\nclass MyCustomTask(Task):\n @property\n def inputs(self) -> \"StepColumns\":\n return [\"input_field\"]\n\n def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n return [\n {\n \"role\": \"user\",\n \"content\": input[\"input_field\"],\n },\n ]\n\n @property\n def outputs(self) -> \"StepColumns\":\n return [\"output_field\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n return {\"output_field\": output}\n If your task just needs a system prompt, a user message template and a way to format the output given by the LLM , then you can use the @task decorator to avoid writing too much boilerplate code. from typing import Any, Dict, Union\nfrom distilabel.steps.tasks import task\n\n\n@task(inputs=[\"input_field\"], outputs=[\"output_field\"])\ndef MyCustomTask(output: Union[str, None], input: Union[Dict[str, Any], None] = None) -> Dict[str, Any]:\n \"\"\"\n ---\n system_prompt: |\n My custom system prompt\n\n user_message_template: |\n My custom user message template: {input_field}\n ---\n \"\"\"\n # Format the `LLM` output here\n return {\"output_field\": output}\n Warning Most Tasks reuse the Task.process method to process the generations, but if a new Task defines a custom process method, like happens for example with Magpie , one hast to deal with the statistics returned by the LLM . "},{"location":"sections/how_to_guides/basic/task/generator_task/","title":"GeneratorTask that produces output","text":""},{"location":"sections/how_to_guides/basic/task/generator_task/#working-with-generatortasks","title":"Working with GeneratorTasks","text":"The GeneratorTask is a custom implementation of a Task based on the GeneratorStep . As with a Task , it is normally used within a Pipeline but can also be used standalone. Warning This task is still experimental and may be subject to changes in the future. 
from typing import Any, Dict, List, Union\nfrom typing_extensions import override\n\nfrom distilabel.steps.tasks.base import GeneratorTask\nfrom distilabel.steps.tasks.typing import ChatType\nfrom distilabel.steps.typing import GeneratorOutput\n\n\nclass MyCustomTask(GeneratorTask):\n instruction: str\n\n @override\n def process(self, offset: int = 0) -> GeneratorOutput:\n output = self.llm.generate(\n inputs=[\n [\n {\"role\": \"user\", \"content\": self.instruction},\n ],\n ],\n )\n output = {\"model_name\": self.llm.model_name}\n output.update(\n self.format_output(output=output, input=None)\n )\n yield output\n\n @property\n def outputs(self) -> List[str]:\n return [\"output_field\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n return {\"output_field\": output}\n We can then use it as follows: task = MyCustomTask(\n name=\"custom-generation\",\n instruction=\"Tell me a joke.\",\n llm=OpenAILLM(model=\"gpt-4\"),\n)\ntask.load()\n\nnext(task.process())\n# [{'output_field\": \"Why did the scarecrow win an award? Because he was outstanding!\", \"model_name\": \"gpt-4\"}]\n Note Most of the times you would need to override the default process method, as it's suited for the standard Task and not for the GeneratorTask . But within the context of the process function you can freely use the llm to generate data in any way. Note The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution. "},{"location":"sections/how_to_guides/basic/task/generator_task/#defining-custom-generatortasks","title":"Defining custom GeneratorTasks","text":"We can define a custom generator task by creating a new subclass of the GeneratorTask and defining the following: -
process : is a method that generates the data based on the LLM and the instruction provided within the class instance, and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs . Note that the inputs argument is not allowed in this function since this is a GeneratorTask . The signature only expects the offset argument, which is used to keep track of the current iteration in the generator. -
outputs : is a property that returns a list of strings with the names of the output fields, this property should always include model_name as one of the outputs since that's automatically injected from the LLM. -
format_output : is a method that receives the output from the LLM and optionally also the input data (which may be useful to build the output in some scenarios), and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs . Note that there's no need to include the model_name in the output. from typing import Any, Dict, List, Union\n\nfrom distilabel.steps.tasks.base import GeneratorTask\nfrom distilabel.steps.tasks.typing import ChatType\n\n\nclass MyCustomTask(GeneratorTask):\n @override\n def process(self, offset: int = 0) -> GeneratorOutput:\n output = self.llm.generate(\n inputs=[\n [{\"role\": \"user\", \"content\": \"Tell me a joke.\"}],\n ],\n )\n output = {\"model_name\": self.llm.model_name}\n output.update(\n self.format_output(output=output, input=None)\n )\n yield output\n\n @property\n def outputs(self) -> List[str]:\n return [\"output_field\", \"model_name\"]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any]\n ) -> Dict[str, Any]:\n return {\"output_field\": output}\n "},{"location":"sections/pipeline_samples/","title":"Tutorials","text":" - End-to-end tutorials provide detailed step-by-step explanations and the code used for end-to-end workflows.
- Paper implementations provide reproductions of fundamental papers in the synthetic data domain.
- Examples don't provide explenations but simply show code for different tasks.
"},{"location":"sections/pipeline_samples/#end-to-end-tutorials","title":"End-to-end tutorials","text":" -
Generate a preference dataset Learn about synthetic data generation for ORPO and DPO. Tutorial -
Clean an existing preference dataset Learn about how to provide AI feedback to clean an existing dataset. Tutorial -
Retrieval and reranking models Learn about synthetic data generation for fine-tuning custom retrieval and reranking models. Tutorial -
Generate text classification data Learn about how synthetic data generation for text classification can help address data imbalance or scarcity. Tutorial "},{"location":"sections/pipeline_samples/#paper-implementations","title":"Paper Implementations","text":" -
Deepseek Prover Learn about an approach to generate mathematical proofs for theorems generated from informal math problems. Example -
DEITA Learn about prompt, response tuning for complexity and quality and LLMs as judges for automatic data selection. Paper -
Instruction Backtranslation Learn about automatically labeling human-written text with corresponding instructions. Paper -
Prometheus 2 Learn about using open-source models as judges for direct assessment and pair-wise ranking. Paper -
UltraFeedback Learn about a large-scale, fine-grained, diverse preference dataset, used for training powerful reward and critic models. Paper -
APIGen Learn how to create verifiable high-quality datasets for function-calling applications. Paper -
CLAIR Learn Contrastive Learning from AI Revisions (CLAIR), a data-creation method which leads to more contrastive preference pairs. Paper -
Math Shepherd Learn about Math-Shepherd, a framework to generate datasets to train process reward models (PRMs) which assign reward scores to each step of math problem solutions. Paper "},{"location":"sections/pipeline_samples/#examples","title":"Examples","text":" -
Benchmarking with distilabel Learn about reproducing the Arena Hard benchmark with distilabel. Example -
Structured generation with outlines Learn about generating RPG characters following a pydantic.BaseModel with outlines in distilabel. Example -
Structured generation with instructor Learn about answering instructions with knowledge graphs defined as pydantic.BaseModel objects using instructor in distilabel. Example -
Create a social network with FinePersonas Learn how to leverage FinePersonas to create a synthetic social network and fine-tune adapters for Multi-LoRA. Example -
Create questions and answers for an exam Learn how to generate questions and answers for an exam, using a raw wikipedia page and structured generation. Example -
Text generation with images in distilabel Ask questions about images using distilabel. Example "},{"location":"sections/pipeline_samples/examples/benchmarking_with_distilabel/","title":"Benchmarking with distilabel ","text":"Benchmark LLMs with distilabel : reproducing the Arena Hard benchmark. The script below first defines both the ArenaHard and the ArenaHardResults tasks, so as to generate responses for a given collection of prompts/questions with up to two LLMs, and then calculate the results as per the original implementation, respectively. Additionally, the second part of the example builds a Pipeline to run the generation on top of the prompts with InferenceEndpointsLLM while streaming the rest of the generations from a pre-computed set of GPT-4 generations, and then evaluate one against the other with OpenAILLM generating an alternate response, a comparison between the responses, and a result as A>>B, A>B, B>A, B>>A, or tie. To run this example you will first need to install the Arena Hard optional dependencies, being pandas , scikit-learn , and numpy . Run python examples/arena_hard.py\n arena_hard.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom typing_extensions import override\n\nfrom distilabel.steps import GlobalStep, StepInput\nfrom distilabel.steps.tasks.base import Task\nfrom distilabel.steps.tasks.typing import ChatType\nfrom distilabel.steps.typing import StepOutput\n\n\nclass ArenaHard(Task):\n \"\"\"Evaluates two assistant responses using an LLM as judge.\n\n This `Task` is based on the \"From Live Data to High-Quality Benchmarks: The\n Arena-Hard Pipeline\" paper that presents Arena Hard, which is a benchmark for\n instruction-tuned LLMs that contains 500 challenging user queries. 
GPT-4 is used\n as the judge to compare the model responses against a baseline model, which defaults\n to `gpt-4-0314`.\n\n Note:\n Arena-Hard-Auto has the highest correlation and separability to Chatbot Arena\n among popular open-ended LLM benchmarks.\n\n Input columns:\n - instruction (`str`): The instruction to evaluate the responses.\n - generations (`List[str]`): The responses generated by two, and only two, LLMs.\n\n Output columns:\n - evaluation (`str`): The evaluation of the responses generated by the LLMs.\n - score (`str`): The score extracted from the evaluation.\n - model_name (`str`): The model name used to generate the evaluation.\n\n Categories:\n - benchmark\n\n References:\n - [From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline](https://lmsys.org/blog/2024-04-19-arena-hard/)\n - [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto/tree/main)\n\n Examples:\n\n Evaluate two assistant responses for a given instruction using Arean Hard prompts:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps import GroupColumns, LoadDataFromDicts\n from distilabel.steps.tasks import ArenaHard, TextGeneration\n\n with Pipeline() as pipeline:\n load_data = LoadDataFromDicts(\n data=[{\"instruction\": \"What is the capital of France?\"}],\n )\n\n text_generation_a = TextGeneration(\n llm=..., # LLM instance\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n text_generation_b = TextGeneration(\n llm=..., # LLM instance\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n combine = GroupColumns(\n columns=[\"generation\", \"generation_model\"],\n output_columns=[\"generations\", \"generation_models\"],\n )\n\n arena_hard = ArenaHard(\n llm=..., # LLM instance\n )\n\n load_data >> [text_generation_a, text_generation_b] >> combine >> arena_hard\n ```\n \"\"\"\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs required by this task are the `instruction` and the `generations`,\n which are the responses generated by two, and only two, LLMs.\"\"\"\n return [\"instruction\", \"generations\"]\n\n def format_input(self, input: Dict[str, Any]) -> ChatType:\n \"\"\"This method formats the input data as a `ChatType` using the prompt defined\n by the Arena Hard benchmark, which consists on a `system_prompt` plus a template\n for the user first message that contains the `instruction` and both `generations`.\n \"\"\"\n return [\n {\n \"role\": \"system\",\n \"content\": \"Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\\n\\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\\n\\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\\n\\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. 
Concise means the response is clear and not verbose or excessive.\\n\\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\\n\\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\\n\\n1. Assistant A is significantly better: [[A>>B]]\\n2. Assistant A is slightly better: [[A>B]]\\n3. Tie, relatively the same: [[A=B]]\\n4. Assistant B is slightly better: [[B>A]]\\n5. Assistant B is significantly better: [[B>>A]]\\n\\nExample output: \\\"My final verdict is tie: [[A=B]]\\\".\",\n },\n {\n \"role\": \"user\",\n \"content\": f\"<|User Prompt|>\\n{input['instruction']}\\n\\n<|The Start of Assistant A's Answer|>\\n{input['generations'][0]}\\n<|The End of Assistant A's Answer|>\\n\\n<|The Start of Assistant B's Answer|>\\n{input['generations'][1]}\\n<|The End of Assistant B's Answer|>\",\n },\n ]\n\n @property\n def outputs(self) -> List[str]:\n \"\"\"The outputs generated by this task are the `evaluation`, the `score` and\n the `model_name` (which is automatically injected within the `process` method\n of the parent task).\"\"\"\n return [\"evaluation\", \"score\", \"model_name\"]\n\n def format_output(\n self,\n output: Union[str, None],\n input: Union[Dict[str, Any], None] = None,\n ) -> Dict[str, Any]:\n \"\"\"This method formats the output generated by the LLM as a Python dictionary\n containing the `evaluation` which is the raw output generated by the LLM (consisting\n of the judge LLM alternate generation for the given instruction, plus an explanation\n on the evaluation of the given responses; plus the `score` extracted from the output.\n\n Args:\n output: the raw output of the LLM.\n input: the input to the task. Is provided in case it needs to be used to enrich\n the output if needed.\n\n Returns:\n A dict with the keys `evaluation` with the raw output which contains the LLM\n evaluation and the extracted `score` if possible.\n \"\"\"\n if output is None:\n return {\"evaluation\": None, \"score\": None}\n pattern = re.compile(r\"\\[\\[([AB<>=]+)\\]\\]\")\n match = pattern.search(output)\n if match is None:\n return {\"evaluation\": output, \"score\": None}\n return {\"evaluation\": output, \"score\": match.group(1)}\n\n\nclass ArenaHardResults(GlobalStep):\n \"\"\"Process Arena Hard results to calculate the ELO scores.\n\n This `Step` is based on the \"From Live Data to High-Quality Benchmarks: The\n Arena-Hard Pipeline\" paper that presents Arena Hard, which is a benchmark for\n instruction-tuned LLMs that contains 500 challenging user queries. 
This step is\n a `GlobalStep` that should run right after the `ArenaHard` task to calculate the\n ELO scores for the evaluated models.\n\n Note:\n Arena-Hard-Auto has the highest correlation and separability to Chatbot Arena\n among popular open-ended LLM benchmarks.\n\n Input columns:\n - evaluation (`str`): The evaluation of the responses generated by the LLMs.\n - score (`str`): The score extracted from the evaluation.\n\n References:\n - [From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline](https://lmsys.org/blog/2024-04-19-arena-hard/)\n - [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto/tree/main)\n\n Examples:\n\n Rate the ELO scores for two assistant responses for a given an evaluation / comparison between both using Arean Hard prompts:\n\n ```python\n from distilabel.pipeline import Pipeline\n from distilabel.steps import GroupColumns, LoadDataFromDicts\n from distilabel.steps.tasks import ArenaHard, TextGeneration\n\n with Pipeline() as pipeline:\n load_data = LoadDataFromDicts(\n data=[{\"instruction\": \"What is the capital of France?\"}],\n )\n\n text_generation_a = TextGeneration(\n llm=..., # LLM instance\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n text_generation_b = TextGeneration(\n llm=..., # LLM instance\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n combine = GroupColumns(\n columns=[\"generation\", \"generation_model\"],\n output_columns=[\"generations\", \"generation_models\"],\n )\n\n arena_hard = ArenaHard(\n llm=..., # LLM instance\n )\n\n arena_hard_results = ArenaHardResults(\n custom_model_column=\"generation_models\",\n custom_weights={\"A>B\": 1, \"A>>B\": 3, \"B>A\": 1, \"B>>A\": 3},\n )\n\n load_data >> [text_generation_a, text_generation_b] >> combine >> arena_hard >> arena_hard_results\n ```\n\n \"\"\"\n\n custom_model_column: Optional[str] = None\n custom_weights: Dict[str, int] = {\"A>B\": 1, \"A>>B\": 3, \"B>A\": 1, \"B>>A\": 3}\n\n def load(self) -> None:\n \"\"\"Ensures that the required dependencies are installed.\"\"\"\n super().load()\n\n try:\n import numpy as np # noqa: F401\n import pandas as pd # noqa: F401\n from sklearn.linear_model import LogisticRegression # noqa: F401\n except ImportError as e:\n raise ImportError(\n \"In order to run `ArenaHardResults`, the `arena-hard` extra dependencies\"\n \" must be installed i.e. `numpy`, `pandas`, and `scikit-learn`.\\n\"\n \"Please install the dependencies by running `pip install distilabel[arena-hard]`.\"\n ) from e\n\n # TODO: the `evaluation` is not really required as an input, so it could be removed, since\n # only `score` is used / required\n @property\n def inputs(self) -> List[str]:\n \"\"\"The inputs required by this step are the `evaluation` and the `score` generated\n by the `ArenaHard` task. Since this step does use the identifiers `model_a` and `model_b`,\n optionally one can set `custom_model_column` to use the model names if existing within\n the input data, ideally this value should be `model_name` if connected from the `ArenaHard`\n step.\"\"\"\n columns = [\"evaluation\", \"score\"]\n if self.custom_model_column:\n columns.append(self.custom_model_column)\n return columns\n\n @override\n def process(self, inputs: StepInput) -> StepOutput: # type: ignore\n \"\"\"This method processes the inputs generated by the `ArenaHard` task to calculate the\n win rates for each of the models to evaluate. 
Since this step inherits from the `GlobalStep`,\n it will wait for all the input batches to be processed, and then the output will be yielded in\n case there's a follow up step, since this step won't modify the received inputs.\n\n Args:\n inputs: A list of Python dictionaries with the inputs of the task.\n\n Yields:\n A list of Python dictionaries with the outputs of the task.\n\n References:\n - https://github.com/lm-sys/arena-hard-auto/blob/main/show_result.py\n \"\"\"\n import numpy as np\n import pandas as pd\n from sklearn.linear_model import LogisticRegression\n\n models = [\"A\", \"B\"]\n if self.custom_model_column:\n models = inputs[0][self.custom_model_column]\n\n # TODO: the battles are only calculated for the first game, even though the official\n # implementation also covers the possibility of a second game (not within the released\n # dataset yet)\n battles = pd.DataFrame()\n for input in inputs:\n output = {\n # TODO: \"question_id\": input[\"question_id\"],\n \"model_a\": models[0],\n \"model_b\": models[1],\n }\n if input[\"score\"] in [\"A>B\", \"A>>B\"]:\n output[\"winner\"] = models[0]\n rows = [output] * self.custom_weights[input[\"score\"]]\n elif input[\"score\"] in [\"B>A\", \"B>>A\"]:\n output[\"winner\"] = models[1]\n rows = [output] * self.custom_weights[input[\"score\"]]\n elif input[\"score\"] == \"A=B\":\n output[\"winner\"] = \"tie\"\n rows = [output]\n else:\n continue\n\n battles = pd.concat([battles, pd.DataFrame(rows)])\n\n models = pd.concat([battles[\"model_a\"], battles[\"model_b\"]]).unique()\n models = pd.Series(np.arange(len(models)), index=models)\n\n battles = pd.concat([battles, battles], ignore_index=True)\n p = len(models.index)\n n = battles.shape[0]\n\n X = np.zeros([n, p])\n X[np.arange(n), models[battles[\"model_a\"]]] = +np.log(10)\n X[np.arange(n), models[battles[\"model_b\"]]] = -np.log(10)\n\n Y = np.zeros(n)\n Y[battles[\"winner\"] == \"model_a\"] = 1.0\n\n tie_idx = battles[\"winner\"] == \"tie\"\n tie_idx[len(tie_idx) // 2 :] = False\n Y[tie_idx] = 1.0\n\n lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8) # type: ignore\n lr.fit(X, Y)\n\n # The ELO scores are calculated assuming that the reference is `gpt-4-0314`\n # with an starting ELO of 1000, so that the evaluated models are compared with\n # `gtp-4-0314` only if it's available within the models\n elo_scores = 400 * lr.coef_[0] + 1000\n # TODO: we could parametrize the reference / anchor model, but left as is to be faithful to the\n # original implementation\n if \"gpt-4-0314\" in models.index:\n elo_scores += 1000 - elo_scores[models[\"gpt-4-0314\"]]\n\n output = pd.Series(elo_scores, index=models.index).sort_values(ascending=False)\n self._logger.info(f\"Arena Hard ELO: {output}\")\n\n # Here only so that if follow up steps are connected the inputs are preserved,\n # since this step doesn't modify nor generate new inputs\n yield inputs\n\n\nif __name__ == \"__main__\":\n import json\n\n from distilabel.models import InferenceEndpointsLLM, OpenAILLM\n from distilabel.pipeline import Pipeline\n from distilabel.steps import (\n GroupColumns,\n KeepColumns,\n LoadDataFromHub,\n StepInput,\n step,\n )\n from distilabel.steps.tasks import TextGeneration\n from distilabel.steps.typing import StepOutput\n\n @step(inputs=[\"turns\"], outputs=[\"system_prompt\", \"instruction\"])\n def PrepareForTextGeneration(*inputs: StepInput) -> StepOutput:\n for input in inputs:\n for item in input:\n item[\"system_prompt\"] = \"You are a helpful assistant.\"\n 
item[\"instruction\"] = item[\"turns\"][0][\"content\"]\n yield input\n\n @step(\n inputs=[\"question_id\"],\n outputs=[\"generation\", \"generation_model\"],\n step_type=\"global\",\n )\n def LoadReference(*inputs: StepInput) -> StepOutput:\n # File downloaded from https://raw.githubusercontent.com/lm-sys/arena-hard-auto/e0a8ea1df42c1df76451a6cd04b14e31ff992b87/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl\n lines = open(\"gpt-4-0314.jsonl\", mode=\"r\").readlines()\n for input in inputs:\n for item in input:\n for line in lines:\n data = json.loads(line)\n if data[\"question_id\"] == item[\"question_id\"]:\n item[\"generation\"] = data[\"choices\"][0][\"turns\"][0][\"content\"]\n item[\"generation_model\"] = data[\"model_id\"]\n break\n yield input\n\n with Pipeline(name=\"arena-hard-v0.1\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n repo_id=\"alvarobartt/lmsys-arena-hard-v0.1\",\n split=\"test\",\n num_examples=5,\n )\n\n load_reference = LoadReference(name=\"load_reference\")\n\n prepare = PrepareForTextGeneration(name=\"prepare\")\n\n text_generation_cohere = TextGeneration(\n name=\"text_generation_cohere\",\n llm=InferenceEndpointsLLM(\n model_id=\"CohereForAI/c4ai-command-r-plus\",\n tokenizer_id=\"CohereForAI/c4ai-command-r-plus\",\n ),\n use_system_prompt=True,\n input_batch_size=10,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n combine_columns = GroupColumns(\n name=\"combine_columns\",\n columns=[\"generation\", \"generation_model\"],\n output_columns=[\"generations\", \"generation_models\"],\n )\n\n arena_hard = ArenaHard(\n name=\"arena_hard\",\n llm=OpenAILLM(model=\"gpt-4-1106-preview\"),\n output_mappings={\"model_name\": \"evaluation_model\"},\n )\n\n keep_columns = KeepColumns(\n name=\"keep_columns\",\n columns=[\n \"question_id\",\n \"category\",\n \"cluster\",\n \"system_prompt\",\n \"instruction\",\n \"generations\",\n \"generation_models\",\n \"evaluation\",\n \"score\",\n \"evaluation_model\",\n ],\n )\n\n win_rates = ArenaHardResults(\n name=\"win_rates\", custom_model_column=\"generation_models\"\n )\n\n load_dataset >> load_reference # type: ignore\n load_dataset >> prepare >> text_generation_cohere # type: ignore\n ( # type: ignore\n [load_reference, text_generation_cohere]\n >> combine_columns\n >> arena_hard\n >> keep_columns\n >> win_rates\n )\n\n distiset = pipeline.run(\n parameters={ # type: ignore\n text_generation_cohere.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096,\n \"stop_sequences\": [\"<EOS_TOKEN>\", \"<|END_OF_TURN_TOKEN|>\"],\n }\n }\n },\n arena_hard.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.0,\n \"max_new_tokens\": 4096,\n }\n }\n },\n },\n )\n if distiset is not None:\n distiset.push_to_hub(\"arena-hard-results\")\n "},{"location":"sections/pipeline_samples/examples/exam_questions/","title":"Create exam questions using structured generation","text":"This example will showcase how to generate exams questions and answers from a text page. In this case, we will use a wikipedia page as an example, and show how to leverage the prompt to help the model generate the data in the appropriate format. We are going to use a meta-llama/Meta-Llama-3.1-8B-Instruct to generate questions and answers for a mock exam from a wikipedia page. In this case, we are going to use the Transfer Learning entry for it. With the help of structured generation we will guide the model to create structured data for us that is easy to parse. 
The structure will be question, answer, and distractors (wrong answers). Click to see the sample results Example page Transfer_learning: QA of the page: {\n \"exam\": [\n {\n \"answer\": \"A technique in machine learning where knowledge learned from a task is re-used to boost performance on a related task.\",\n \"distractors\": [\"A type of neural network architecture\", \"A machine learning algorithm for image classification\", \"A method for data preprocessing\"],\n \"question\": \"What is transfer learning?\"\n },\n {\n \"answer\": \"1976\",\n \"distractors\": [\"1981\", \"1992\", \"1998\"],\n \"question\": \"In which year did Bozinovski and Fulgosi publish a paper addressing transfer learning in neural network training?\"\n },\n {\n \"answer\": \"Discriminability-based transfer (DBT) algorithm\",\n \"distractors\": [\"Multi-task learning\", \"Learning to Learn\", \"Cost-sensitive machine learning\"],\n \"question\": \"What algorithm was formulated by Lorien Pratt in 1992?\"\n },\n {\n \"answer\": \"A domain consists of a feature space and a marginal probability distribution.\",\n \"distractors\": [\"A domain consists of a label space and an objective predictive function.\", \"A domain consists of a task and a learning algorithm.\", \"A domain consists of a dataset and a model.\"],\n \"question\": \"What is the definition of a domain in the context of transfer learning?\"\n },\n {\n \"answer\": \"Transfer learning aims to help improve the learning of the target predictive function in the target domain using the knowledge in the source domain and learning task.\",\n \"distractors\": [\"Transfer learning aims to learn a new task from scratch.\", \"Transfer learning aims to improve the learning of the source predictive function in the source domain.\", \"Transfer learning aims to improve the learning of the target predictive function in the source domain.\"],\n \"question\": \"What is the goal of transfer learning?\"\n },\n {\n \"answer\": \"Markov logic networks, Bayesian networks, cancer subtype discovery, building utilization, general game playing, text classification, digit recognition, medical imaging, and spam filtering.\",\n \"distractors\": [\"Supervised learning, unsupervised learning, reinforcement learning, natural language processing, computer vision, and robotics.\", \"Image classification, object detection, segmentation, and tracking.\", \"Speech recognition, sentiment analysis, and topic modeling.\"],\n \"question\": \"What are some applications of transfer learning?\"\n },\n {\n \"answer\": \"ADAPT (Python), TLib (Python), Domain-Adaptation-Toolbox (Matlab)\",\n \"distractors\": [\"TensorFlow, PyTorch, Keras\", \"Scikit-learn, OpenCV, NumPy\", \"Matlab, R, Julia\"],\n \"question\": \"What are some software implementations of transfer learning and domain adaptation algorithms?\"\n }\n ]\n}\n "},{"location":"sections/pipeline_samples/examples/exam_questions/#build-the-pipeline","title":"Build the pipeline","text":"Let's see how to build a pipeline to obtain this type of data: from typing import List\nfrom pathlib import Path\n\nfrom pydantic import BaseModel, Field\n\nfrom distilabel.llms import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\nimport wikipedia\n\npage = wikipedia.page(title=\"Transfer_learning\") # (1)\n\n\nclass ExamQuestion(BaseModel):\n question: str = Field(..., description=\"The question to be answered\")\n answer: str = Field(..., 
description=\"The correct answer to the question\")\n distractors: List[str] = Field(\n ..., description=\"A list of incorrect but viable answers to the question\"\n )\n\nclass ExamQuestions(BaseModel): # (2)\n exam: List[ExamQuestion]\n\n\nSYSTEM_PROMPT = \"\"\"\\\nYou are an exam writer specialized in writing exams for students.\nYour goal is to create questions and answers based on the document provided, and a list of distractors, that are incorrect but viable answers to the question.\nYour answer must adhere to the following format:\n```\n[\n {\n \"question\": \"Your question\",\n \"answer\": \"The correct answer to the question\",\n \"distractors\": [\"wrong answer 1\", \"wrong answer 2\", \"wrong answer 3\"]\n },\n ... (more questions and answers as required)\n]\n```\n\"\"\".strip() #\u00a0(3)\n\n\nwith Pipeline(name=\"ExamGenerator\") as pipeline:\n\n load_dataset = LoadDataFromDicts(\n name=\"load_instructions\",\n data=[\n {\n \"page\": page.content, #\u00a0(4)\n }\n ],\n )\n\n text_generation = TextGeneration( # (5)\n name=\"exam_generation\",\n system_prompt=SYSTEM_PROMPT,\n template=\"Generate a list of answers and questions about the document. Document:\\n\\n{{ page }}\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n structured_output={\n \"schema\": ExamQuestions.model_json_schema(),\n \"format\": \"json\"\n },\n ),\n input_batch_size=8,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n load_dataset >> text_generation # (6)\n -
Download a single page for the demo. We could first download the pages, or apply the same procedure to any type of data we want. In a real-world use case, we would want to make a dataset from these documents first. -
Define the structure required for the answer using Pydantic. In this case, we want, for each page, a list with questions and answers (additionally we've added distractors, but they can be ignored for this case). So our output will be an ExamQuestions model, which is a list of ExamQuestion , where each one consists of the question and answer fields as string fields. The language model will use the field descriptions to generate the values. -
Use the system prompt to guide the model towards the behaviour we want from it. Independently from the structured output we are forcing the model to have, it helps if we pass the format expected in our prompt. -
Move the page content from wikipedia to a row in the dataset. -
The TextGeneration task gets the system prompt, and the user prompt by means of the template argument, where we help the model generate the questions and answers based on the page content, which will be obtained from the corresponding column of the loaded data. -
Connect both steps, and we are done. "},{"location":"sections/pipeline_samples/examples/exam_questions/#run-the-example","title":"Run the example","text":"To run this example you will first need to install the wikipedia dependency to download the sample data, being pip install wikipedia . Change the username first in case you want to push the dataset to the hub using your account. Run python examples/exam_questions.py\n exam_questions.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nimport wikipedia\nfrom pydantic import BaseModel, Field\n\nfrom distilabel.llms import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\npage = wikipedia.page(title=\"Transfer_learning\")\n\n\nclass ExamQuestion(BaseModel):\n question: str = Field(..., description=\"The question to be answered\")\n answer: str = Field(..., description=\"The correct answer to the question\")\n distractors: List[str] = Field(\n ..., description=\"A list of incorrect but viable answers to the question\"\n )\n\n\nclass ExamQuestions(BaseModel):\n exam: List[ExamQuestion]\n\n\nSYSTEM_PROMPT = \"\"\"\\\nYou are an exam writer specialized in writing exams for students.\nYour goal is to create questions and answers based on the document provided, and a list of distractors, that are incorrect but viable answers to the question.\nYour answer must adhere to the following format:\n```\n[\n {\n \"question\": \"Your question\",\n \"answer\": \"The correct answer to the question\",\n \"distractors\": [\"wrong answer 1\", \"wrong answer 2\", \"wrong answer 3\"]\n },\n ... (more questions and answers as required)\n]\n```\n\"\"\".strip()\n\n\nwith Pipeline(name=\"ExamGenerator\") as pipeline:\n load_dataset = LoadDataFromDicts(\n name=\"load_instructions\",\n data=[\n {\n \"page\": page.content,\n }\n ],\n )\n\n text_generation = TextGeneration(\n name=\"exam_generation\",\n system_prompt=SYSTEM_PROMPT,\n template=\"Generate a list of answers and questions about the document. 
Document:\\n\\n{{ page }}\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n structured_output={\n \"schema\": ExamQuestions.model_json_schema(),\n \"format\": \"json\",\n },\n ),\n input_batch_size=8,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n load_dataset >> text_generation\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n text_generation.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 2048,\n }\n }\n }\n },\n use_cache=False,\n )\n distiset.push_to_hub(\"USERNAME/exam_questions\")\n "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/","title":"Create a social network with FinePersonas","text":"In this example, we'll explore the creation of specialized user personas for social network interactions using the FinePersonas-v0.1 dataset from Hugging Face. The final dataset will be ready to fine-tune a chat model with specific traits and characteristics. "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#introduction","title":"Introduction","text":"We'll delve into the process of fine-tuning different LoRA (Low-Rank Adaptation) models to imbue these personas with specific traits and characteristics. This approach draws inspiration from Michael Sayman's work on SocialAI (visit the profile to see some examples), to leverage FinePersonas-v0.1 for building models that can emulate bots with specific behaviour. By fine-tuning these adapters, we can potentially create AI personas with distinct characteristics, communication styles, and areas of expertise. The result? AI interactions that feel more natural and tailored to specific contexts or user needs. For those interested in the technical aspects of this approach, we recommend the insightful blog post on Multi-LoRA serving. It provides a clear and comprehensive explanation of the technology behind this innovative method. Let's jump to the demo. "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#creating-our-socialai-task","title":"Creating our SocialAI Task","text":"Building on the new TextGeneration , creating custom tasks is easier than ever before. This powerful tool opens up a world of possibilities for creating tailored text-based content with ease and precision. We will create a SocialAI task that will be in charge of generating responses to user interactions, taking into account a given follower_type , and use the perspective from a given persona : from distilabel.steps.tasks import TextGeneration\n\nclass SocialAI(TextGeneration):\n follower_type: Literal[\"supporter\", \"troll\", \"alarmist\"] = \"supporter\"\n system_prompt: str = (\n \"You are an AI assistant expert at simulating user interactions. 
\"\n \"You must answer as if you were a '{follower_type}', be concise answer with no more than 200 characters, nothing else.\"\n \"Here are some traits to use for your personality:\\n\\n\"\n \"{traits}\"\n ) #\u00a0(1)\n template: str = \"You are the folowing persona:\\n\\n{{ persona }}\\n\\nWhat would you say to the following?\\n\\n {{ post }}\" # (2)\n columns: str | list[str] = [\"persona\", \"post\"] # (3)\n\n _follower_traits: dict[str, str] = {\n \"supporter\": (\n \"- Encouraging and positive\\n\"\n \"- Tends to prioritize enjoyment and relaxation\\n\"\n \"- Focuses on the present moment and short-term pleasure\\n\"\n \"- Often uses humor and playful language\\n\"\n \"- Wants to help others feel good and have fun\\n\"\n ),\n \"troll\": (\n \"- Provocative and confrontational\\n\"\n \"- Enjoys stirring up controversy and conflict\\n\"\n \"- Often uses sarcasm, irony, and mocking language\\n\"\n \"- Tends to belittle or dismiss others' opinions and feelings\\n\"\n \"- Seeks to get a rise out of others and create drama\\n\"\n ),\n \"alarmist\": (\n \"- Anxious and warning-oriented\\n\"\n \"- Focuses on potential risks and negative consequences\\n\"\n \"- Often uses dramatic or sensational language\\n\"\n \"- Tends to be serious and stern in tone\\n\"\n \"- Seeks to alert others to potential dangers and protect them from harm (even if it's excessive or unwarranted)\\n\"\n ),\n }\n\n def load(self) -> None:\n super().load()\n self.system_prompt = self.system_prompt.format(\n follower_type=self.follower_type,\n traits=self._follower_traits[self.follower_type]\n ) # (4)\n -
We have a custom system prompt that will depend on the follower_type we choose for our model. -
The base template or prompt will answer the post we have, from the point of view of a persona . -
We will need our dataset to have both persona and post columns to populate the prompt. -
In the load method we place the specific traits for our follower type in the system prompt. "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#data-preparation","title":"Data preparation","text":"This is an example, so let's keep it short. We will use 3 posts, and 3 different types of personas. While there's potential to enhance this process (perhaps by implementing random persona selection or leveraging semantic similarity) we'll opt for a straightforward method in this demonstration. Our goal is to create a set of nine examples, each pairing a post with a persona. To achieve this, we'll employ an LLM to respond to each post from the perspective of a specific persona , effectively simulating how different characters might engage with the content. posts = [\n {\n \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\"\n },\n {\n \"post\": \"I need to develop a training course for my company on communication skills. Need to decide how deliver it remotely.\"\n },\n {\n \"post\": \"I'm always 10 minutes late to meetups but no one's complained. Could this be annoying to them?\"\n },\n]\n\npersonas = (\n load_dataset(\"argilla/FinePersonas-v0.1-clustering-100k\", split=\"train\")\n .shuffle()\n .select(range(3))\n .select_columns(\"persona\")\n .to_list()\n)\n\ndata = []\nfor post in posts:\n for persona in personas:\n data.append({\"post\": post[\"post\"], \"persona\": persona[\"persona\"]})\n Each row in will have the following format: import json\nprint(json.dumps(data[0], indent=4))\n{\n \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\",\n \"persona\": \"A high school or college environmental science teacher or an ecology student specializing in biogeography and ecosystem dynamics.\"\n}\n This will be our dataset, that we can ingest using the LoadDataFromDicts : loader = LoadDataFromDicts(data=data)\n "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#simulating-from-different-types-of-followers","title":"Simulating from different types of followers","text":"With our data in hand, we're ready to explore the capabilities of our SocialAI task. For this demonstration, we'll make use of of meta-llama/Meta-Llama-3.1-70B-Instruct While this model has become something of a go-to choice recently, it's worth noting that experimenting with a variety of models could yield even more interesting results: from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 256,\n },\n)\nfollower_type = \"supporter\"\n\nfollower = SocialAI(\n llm=llm,\n follower_type=follower_type,\n name=f\"{follower_type}_user\",\n)\n This setup simplifies the process, we only need to input the follower type, and the system handles the rest. We could update this too to have a random type of follower by default, and simulate from a bunch of different personalities. "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#building-our-pipeline","title":"Building our Pipeline","text":"The foundation of our pipeline is now in place. At its core is a single, powerful LLM. 
This versatile model will be repurposed to drive three distinct SocialAI Tasks, each tailored to a specific TextGeneration task, and each one of them will be prepared for Supervised Fine Tuning using FormatTextGenerationSFT : with Pipeline(name=\"Social AI Personas\") as pipeline:\n loader = LoadDataFromDicts(data=data, batch_size=1)\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 256,\n },\n )\n\n for follower_type in [\"supporter\", \"troll\", \"alarmist\"]:\n follower = SocialAI(\n llm=llm,\n follower_type=follower_type,\n name=f\"{follower_type}_user\", # (1)\n output_mappings={\n \"generation\": f\"interaction_{follower_type}\" # (2)\n }\n )\n format_sft = FormatTextGenerationSFT(\n name=f\"format_sft_{follower_type}\",\n input_mappings={\n \"instruction\": \"post\",\n \"generation\": f\"interaction_{follower_type}\" # (3)\n },\n )\n loader >> follower >> format_sft # (4)\n -
We update the name of the step so it can be tracked in the pipeline. -
The generation column from each follower task will be mapped to a different name to avoid it being overridden, as we are reusing the same task. -
As we have modified the output column from SocialAI , we redirect each one of the \"follower_type\" responses. -
Connect the loader to each one of the follower tasks and format_sft to obtain 3 different subsets. The outcome of this pipeline will be three specialized models, each fine-tuned to a unique follower type crafted by the SocialAI task. These models will generate SFT-formatted datasets, where each post is paired with its corresponding interaction data for a specific follower type. This setup enables seamless fine-tuning using your preferred framework, such as TRL, or any other training framework of your choice. "},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#script-and-final-dataset","title":"Script and final dataset","text":"All the pieces are in place for our script, the full pipeline can be seen here: Run python examples/finepersonas_social_ai.py\n finepersonas_social_ai.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Literal\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import FormatTextGenerationSFT, LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass SocialAI(TextGeneration):\n follower_type: Literal[\"supporter\", \"troll\", \"alarmist\"] = \"supporter\"\n system_prompt: str = (\n \"You are an AI assistant expert at simulating user interactions. 
\"\n \"You must answer as if you were a '{follower_type}', be concise answer with no more than 200 characters, nothing else.\"\n \"Here are some traits to use for your personality:\\n\\n\"\n \"{traits}\"\n )\n template: str = \"You are the folowing persona:\\n\\n{{ persona }}\\n\\nWhat would you say to the following?\\n\\n {{ post }}\"\n columns: str | list[str] = [\"persona\", \"post\"]\n\n _follower_traits: dict[str, str] = {\n \"supporter\": (\n \"- Encouraging and positive\\n\"\n \"- Tends to prioritize enjoyment and relaxation\\n\"\n \"- Focuses on the present moment and short-term pleasure\\n\"\n \"- Often uses humor and playful language\\n\"\n \"- Wants to help others feel good and have fun\\n\"\n ),\n \"troll\": (\n \"- Provocative and confrontational\\n\"\n \"- Enjoys stirring up controversy and conflict\\n\"\n \"- Often uses sarcasm, irony, and mocking language\\n\"\n \"- Tends to belittle or dismiss others' opinions and feelings\\n\"\n \"- Seeks to get a rise out of others and create drama\\n\"\n ),\n \"alarmist\": (\n \"- Anxious and warning-oriented\\n\"\n \"- Focuses on potential risks and negative consequences\\n\"\n \"- Often uses dramatic or sensational language\\n\"\n \"- Tends to be serious and stern in tone\\n\"\n \"- Seeks to alert others to potential dangers and protect them from harm (even if it's excessive or unwarranted)\\n\"\n ),\n }\n\n def load(self) -> None:\n super().load()\n self.system_prompt = self.system_prompt.format(\n follower_type=self.follower_type,\n traits=self._follower_traits[self.follower_type],\n )\n\n\nposts = [\n {\n \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\"\n },\n {\n \"post\": \"I need to develop a training course for my company on communication skills. Need to decide how deliver it remotely.\"\n },\n {\n \"post\": \"I'm always 10 minutes late to meetups but no one's complained. 
Could this be annoying to them?\"\n },\n]\n\npersonas = (\n load_dataset(\"argilla/FinePersonas-v0.1-clustering-100k\", split=\"train\")\n .shuffle()\n .select(range(3))\n .select_columns(\"persona\")\n .to_list()\n)\n\ndata = []\nfor post in posts:\n for persona in personas:\n data.append({\"post\": post[\"post\"], \"persona\": persona[\"persona\"]})\n\n\nwith Pipeline(name=\"Social AI Personas\") as pipeline:\n loader = LoadDataFromDicts(data=data, batch_size=1)\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 256,\n },\n )\n\n for follower_type in [\"supporter\", \"troll\", \"alarmist\"]:\n follower = SocialAI(\n llm=llm,\n follower_type=follower_type,\n name=f\"{follower_type}_user\",\n output_mappings={\"generation\": f\"interaction_{follower_type}\"},\n )\n format_sft = FormatTextGenerationSFT(\n name=f\"format_sft_{follower_type}\",\n input_mappings={\n \"instruction\": \"post\",\n \"generation\": f\"interaction_{follower_type}\",\n },\n )\n loader >> follower >> format_sft\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n distiset.push_to_hub(\"plaguss/FinePersonas-SocialAI-test\", include_script=True)\n This is the final toy dataset we obtain: FinePersonas-SocialAI-test You can see examples of how to load each subset of them to fine-tune a model: from datasets import load_dataset\n\nds = load_dataset(\"plaguss/FinePersonas-SocialAI-test\", \"format_sft_troll\")\n And a sample of the generated field with the corresponding post and persona : {\n \"post\": \"Hmm, ok now I\\u0027m torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\",\n \"persona\": \"A high school or undergraduate physics or chemistry teacher, likely with a focus on experimental instruction.\",\n \"interaction_troll\": \"\\\"Late night cravings? More like late night brain drain. Either way, it\\u0027s just a collision of molecules in your stomach. Choose the one with more calories, at least that\\u0027s some decent kinetic energy.\\\"\",\n}\n There's a lot of room for improvement, but quite a promising start. "},{"location":"sections/pipeline_samples/examples/llama_cpp_with_outlines/","title":"Structured generation with outlines ","text":"Generate RPG characters following a pydantic.BaseModel with outlines in distilabel . This script makes use of LlamaCppLLM and the structured output capabilities thanks to outlines to generate RPG characters that adhere to a JSON schema. It makes use of a local model which can be downloaded using curl (explained in the script itself), and can be exchanged with other LLMs like vLLM . 
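Because the generations are constrained to the Character JSON schema, a quick sanity check after running the script below is to parse them back with the same pydantic model. This is a minimal sketch, not part of the original example; the Character model is repeated here for convenience and the sample generation is illustrative:

```python
from enum import Enum

from pydantic import BaseModel, StringConstraints, conint
from typing_extensions import Annotated


class Weapon(str, Enum):
    sword = "sword"
    axe = "axe"
    mace = "mace"
    spear = "spear"
    bow = "bow"
    crossbow = "crossbow"


class Armor(str, Enum):
    leather = "leather"
    chainmail = "chainmail"
    plate = "plate"
    mithril = "mithril"


class Character(BaseModel):
    name: Annotated[str, StringConstraints(max_length=30)]
    age: conint(gt=1, lt=3000)
    armor: Armor
    weapon: Weapon


# Illustrative generation; any output produced with
# structured_output={"format": "json", "schema": Character} should parse cleanly.
generation = '{"name": "Gimli", "age": 42, "armor": "plate", "weapon": "axe"}'
character = Character.model_validate_json(generation)
print(character)
```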
Run python examples/structured_generation_with_outlines.py\n structured_generation_with_outlines.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom enum import Enum\nfrom pathlib import Path\n\nfrom pydantic import BaseModel, StringConstraints, conint\nfrom typing_extensions import Annotated\n\nfrom distilabel.models import LlamaCppLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass Weapon(str, Enum):\n sword = \"sword\"\n axe = \"axe\"\n mace = \"mace\"\n spear = \"spear\"\n bow = \"bow\"\n crossbow = \"crossbow\"\n\n\nclass Armor(str, Enum):\n leather = \"leather\"\n chainmail = \"chainmail\"\n plate = \"plate\"\n mithril = \"mithril\"\n\n\nclass Character(BaseModel):\n name: Annotated[str, StringConstraints(max_length=30)]\n age: conint(gt=1, lt=3000)\n armor: Armor\n weapon: Weapon\n\n\n# Download the model with\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nwith Pipeline(\"RPG-characters\") as pipeline:\n system_prompt = (\n \"You are a leading role play gamer. 
You have seen thousands of different characters and their attributes.\"\n \" Please return a JSON object with common attributes of an RPG character.\"\n )\n\n load_dataset = LoadDataFromDicts(\n name=\"load_instructions\",\n data=[\n {\n \"system_prompt\": system_prompt,\n \"instruction\": f\"Give me a character description for a {char}\",\n }\n for char in [\"dwarf\", \"elf\", \"human\", \"ork\"]\n ],\n )\n llm = LlamaCppLLM(\n model_path=str(Path.home() / model_path), # type: ignore\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"json\", \"schema\": Character},\n )\n # Change to vLLM as such:\n # llm = vLLM(\n # model=\"teknium/OpenHermes-2.5-Mistral-7B\",\n # extra_kwargs={\"tensor_parallel_size\": 1},\n # structured_output={\"format\": \"json\", \"schema\": Character},\n # )\n\n text_generation = TextGeneration(\n name=\"text_generation_rpg\",\n llm=llm,\n input_batch_size=8,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n load_dataset >> text_generation\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n text_generation.name: {\n \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 256}}\n }\n },\n use_cache=False,\n )\n for num, character in enumerate(distiset[\"default\"][\"train\"][\"generation\"]):\n print(f\"Character: {num}\")\n print(character)\n\n# Character: 0\n# {\n# \"name\": \"Gimli\",\n# \"age\": 42,\n# \"armor\": \"plate\",\n# \"weapon\": \"axe\" }\n# Character: 1\n# {\"name\":\"Gaelen\",\"age\":600,\"armor\":\"leather\",\"weapon\":\"bow\"}\n# Character: 2\n# {\"name\": \"John Smith\",\"age\": 35,\"armor\": \"leather\",\"weapon\": \"sword\"}\n# Character: 3\n# { \"name\": \"Grug\", \"age\": 35, \"armor\": \"leather\", \"weapon\": \"axe\"}\n "},{"location":"sections/pipeline_samples/examples/mistralai_with_instructor/","title":"Structured generation with instructor ","text":"Answer instructions with knowledge graphs defined as pydantic.BaseModel objects using instructor in distilabel . This script makes use of MistralLLM and the structured output capabilities thanks to instructor to generate knowledge graphs from complex topics. This example is translated from this awesome example from instructor cookbook. 
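Once the pipeline has produced a KnowledgeGraph for each instruction, it can be rendered with graphviz, which is the same idea behind the examples/draw_kg.py helper mentioned below. This is a minimal sketch: the Node, Edge and KnowledgeGraph models are repeated here for convenience, and the sample graph is illustrative.

```python
from typing import List

from graphviz import Digraph
from pydantic import BaseModel, Field


class Node(BaseModel):
    id: int
    label: str
    color: str


class Edge(BaseModel):
    source: int
    target: int
    label: str
    color: str = "black"


class KnowledgeGraph(BaseModel):
    nodes: List[Node] = Field(default_factory=list)
    edges: List[Edge] = Field(default_factory=list)


def render(kg: KnowledgeGraph, filename: str = "knowledge_graph") -> None:
    """Draws the knowledge graph and writes it to disk as a PNG."""
    dot = Digraph(comment="Knowledge Graph", format="png")
    for node in kg.nodes:
        dot.node(str(node.id), node.label, color=node.color)
    for edge in kg.edges:
        dot.edge(str(edge.source), str(edge.target), label=edge.label, color=edge.color)
    dot.render(filename, view=False)


# Illustrative graph; in practice you would parse the `generation` column, e.g.
# KnowledgeGraph.model_validate_json(row["generation"]).
kg = KnowledgeGraph(
    nodes=[
        Node(id=1, label="Quantum mechanics", color="blue"),
        Node(id=2, label="Wave function", color="green"),
    ],
    edges=[Edge(source=1, target=2, label="describes states via")],
)
render(kg)
```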
Run python examples/structured_generation_with_instructor.py\n structured_generation_with_instructor.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nfrom pydantic import BaseModel, Field\n\nfrom distilabel.models import MistralLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass Node(BaseModel):\n id: int\n label: str\n color: str\n\n\nclass Edge(BaseModel):\n source: int\n target: int\n label: str\n color: str = \"black\"\n\n\nclass KnowledgeGraph(BaseModel):\n nodes: List[Node] = Field(..., default_factory=list)\n edges: List[Edge] = Field(..., default_factory=list)\n\n\nwith Pipeline(\n name=\"Knowledge-Graphs\",\n description=(\n \"Generate knowledge graphs to answer questions, this type of dataset can be used to \"\n \"steer a model to answer questions with a knowledge graph.\"\n ),\n) as pipeline:\n sample_questions = [\n \"Teach me about quantum mechanics\",\n \"Who is who in The Simpsons family?\",\n \"Tell me about the evolution of programming languages\",\n ]\n\n load_dataset = LoadDataFromDicts(\n name=\"load_instructions\",\n data=[\n {\n \"system_prompt\": \"You are a knowledge graph expert generator. Help me understand by describing everything as a detailed knowledge graph.\",\n \"instruction\": f\"{question}\",\n }\n for question in sample_questions\n ],\n )\n\n text_generation = TextGeneration(\n name=\"knowledge_graph_generation\",\n llm=MistralLLM(\n model=\"open-mixtral-8x22b\", structured_output={\"schema\": KnowledgeGraph}\n ),\n input_batch_size=8,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n load_dataset >> text_generation\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n text_generation.name: {\n \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 2048}}\n }\n },\n use_cache=False,\n )\n\n distiset.push_to_hub(\"distilabel-internal-testing/knowledge_graphs\")\n Visualizing the graphs Want to see how to visualize the graphs? You can test it using the following script. Generate some samples on your own and take a look: Note This example uses graphviz to render the graph, you can install with pip in the following way: pip install graphviz\n python examples/draw_kg.py 2 # You can pass 0,1,2 to visualize each of the samples.\n "},{"location":"sections/pipeline_samples/examples/text_generation_with_image/","title":"Text generation with images in distilabel ","text":"Answer questions about images using distilabel . Image-text-to-text models take in an image and text prompt and output text. In this example we will use an LLM InferenceEndpointsLLM with meta-llama/Llama-3.2-11B-Vision-Instruct to ask a question about an image, and OpenAILLM with gpt-4o-mini . We will ask a simple question to showcase how the TextGenerationWithImage task can be used in a pipeline. 
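As noted in the callouts below, image_type also accepts a base64 string or a PIL image besides a URL. A minimal sketch of preparing a base64 row for the loader (the local file path is hypothetical):

```python
import base64
from pathlib import Path

# Hypothetical local file; with image_type="base64" the `image` column carries
# the base64-encoded bytes of the image instead of a URL.
image_path = Path("my_image.jpg")
image_b64 = base64.b64encode(image_path.read_bytes()).decode("utf-8")

data = [{"instruction": "What's in this image?", "image": image_b64}]
```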
Inference Endpoints - meta-llama/Llama-3.2-11B-Vision-InstructOpenAI - gpt-4o-mini from distilabel.models.llms import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks.text_generation_with_image import TextGenerationWithImage\nfrom distilabel.steps import LoadDataFromDicts\n\n\nwith Pipeline(name=\"vision_generation_pipeline\") as pipeline:\n loader = LoadDataFromDicts(\n data=[\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n }\n ],\n )\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n )\n\n vision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=llm,\n image_type=\"url\" # (1)\n )\n\n loader >> vision\n - The image_type can be a url pointing to the image, the base64 string representation, or a PIL image, take a look at the
TextGenerationWithImage for more information. Image: Question: What\u2019s in this image? Response: This image depicts a wooden boardwalk weaving its way through a lush meadow, flanked by vibrant green grass that stretches towards the horizon under a calm and inviting sky. The boardwalk runs straight ahead, away from the viewer, forming a clear pathway through the tall, lush green grass, crops or other plant types or an assortment of small trees and shrubs. This meadow is dotted with trees and shrubs, appearing to be healthy and green. The sky above is a beautiful blue with white clouds scattered throughout, adding a sense of tranquility to the scene. While this image appears to be of a natural landscape, because grass is... from distilabel.models.llms import OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks.text_generation_with_image import TextGenerationWithImage\nfrom distilabel.steps import LoadDataFromDicts\n\n\nwith Pipeline(name=\"vision_generation_pipeline\") as pipeline:\n loader = LoadDataFromDicts(\n data=[\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n }\n ],\n )\n\n llm = OpenAILLM(\n model=\"gpt-4o-mini\",\n )\n\n vision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=llm,\n image_type=\"url\" # (1)\n )\n\n loader >> vision\n - The image_type can be a url pointing to the image, the base64 string representation, or a PIL image, take a look at the
VisionGeneration for more information. Image: Question: What\u2019s in this image? Response: The image depicts a scenic landscape featuring a wooden walkway or path that runs through a lush green marsh or field. The area is surrounded by tall grass and various shrubs, with trees likely visible in the background. The sky is blue with some wispy clouds, suggesting a beautiful day. Overall, it presents a peaceful natural setting, ideal for a stroll or nature observation. The full pipeline can be run at the following example: Run the full pipeline python examples/text_generation_with_image.py\n text_generation_with_image.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom distilabel.models.llms import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks.text_generation_with_image import TextGenerationWithImage\n\nwith Pipeline(name=\"vision_generation_pipeline\") as pipeline:\n loader = LoadDataFromDicts(\n data=[\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n }\n ],\n )\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n )\n\n vision = TextGenerationWithImage(name=\"vision_gen\", llm=llm, image_type=\"url\")\n\n loader >> vision\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n distiset.push_to_hub(\"plaguss/test-vision-generation-Llama-3.2-11B-Vision-Instruct\")\n A sample dataset can be seen at plaguss/test-vision-generation-Llama-3.2-11B-Vision-Instruct. "},{"location":"sections/pipeline_samples/papers/apigen/","title":"Create Function-Calling datasets with APIGen","text":"This example will introduce APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets, a data generation pipeline designed to synthesize verifiable high-quality datasets for function-calling applications. "},{"location":"sections/pipeline_samples/papers/apigen/#replication","title":"Replication","text":"The following figure showcases the APIGen framework: Now, let's walk through the key steps illustrated in the figure: -
DataSampler : With the help of this step and the original Salesforce/xlam-function-calling-60k dataset, we obtain the Seed QA Data Sampler for the prompt template. -
APIGenGenerator : This step does the job of the Query-Answer Generator, including the format checker from Stage 1: Format Checker thanks to the structured output generation. -
APIGenExecutionChecker : This step is in charge of the Stage 2: Execution Checker. -
APIGenSemanticChecker : Step in charge of running Stage 3: Semantic Checker, can use the same or a different LLM, we are using the same as in APIGenGenerator step. The current implementation hasn't utilized the Diverse Prompt Library. To incorporate it, one could either adjust the prompt template within the APIGenGenerator or develop a new sampler specifically for this purpose. As for the API Sampler, while no specific data is shared here, we've created illustrative examples to demonstrate the pipeline's functionality. These examples represent a mix of data that could be used to replicate the sampler's output. "},{"location":"sections/pipeline_samples/papers/apigen/#data-preparation","title":"Data preparation","text":"The original paper tells about the data they used and give some hints, but nothing was shared. In this example, we will write a bunch of examples by hand to showcase how this pipeline can be built. Assume we have the following function names, and corresponding descriptions of their behaviour: data = [\n {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n },\n {\n \"func_name\": \"permutation_count\",\n \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n },\n {\n \"func_name\": \"getdivision\",\n \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n },\n {\n \"func_name\": \"binary_addition\",\n \"func_desc\": \"Adds two binary numbers and returns the result as a binary string.\",\n },\n {\n \"func_name\": \"swapi_planet_resource\",\n \"func_desc\": \"get a specific planets resource\",\n },\n {\n \"func_name\": \"disney_character\",\n \"func_desc\": \"Find a specific character using this endpoint\",\n }\n]\n The original paper refers to both python functions and APIs, but we will make use of python functions exclusively for simplicity. In order to execute and check this functions/APIs, we need access to the code, which we have moved to a Python file: lib_apigen.py. All this functions are executable, but we also need access to their tool representation. For this, we will make use of transformers' get_json_schema function1. We have all the machinery prepared in our libpath, except from the tool definition. With the help of our helper function load_module_from_path we will load this python module, collect all the tools, and add them to each row in our data variable. from distilabel.steps.tasks.apigen.utils import load_module_from_path\n\nlibpath_module = load_module_from_path(libpath)\ntools = getattr(libpath_module, \"get_tools\")() # call get_tools()\n\nfor row in data:\n #\u00a0The tools should have a mix where both the correct and irrelevant tools are present.\n row.update({\"tools\": [tools[row[\"func_name\"]]]})\n Now we have all the necessary data for our prompt. Additionally, we will make use of the original dataset as few-shot examples to enhance the model: ds_og = (\n load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n .shuffle(seed=42)\n .select(range(500))\n .to_list()\n)\n We have just loaded a subset and transformed it to a list of dictionaries, as we will use it in the DataSampler GeneratorStep , grabbing random examples from the original dataset. 
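As a reference, here is a minimal sketch of the idea behind the get_tools() helper in lib_apigen.py (the function body and docstring here are illustrative): each function in the library carries a Google-style docstring with typed arguments, and transformers' get_json_schema turns it into the tool representation that we attach to every row.

```python
from transformers.utils import get_json_schema


def final_velocity(initial_velocity: float, acceleration: float, time: float) -> float:
    """Calculates the final velocity of an object given its initial velocity, acceleration, and time.

    Args:
        initial_velocity: The initial velocity of the object.
        acceleration: The acceleration of the object.
        time: The time elapsed.
    """
    return initial_velocity + acceleration * time


def get_tools() -> dict:
    """Maps each function name to its JSON-schema tool representation."""
    return {"final_velocity": get_json_schema(final_velocity)}


# Prints a dict of the form {"type": "function", "function": {...}}, the same
# structure shown in the example row further down.
print(get_tools()["final_velocity"])
```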
"},{"location":"sections/pipeline_samples/papers/apigen/#building-the-pipeline","title":"Building the Pipeline","text":"Now that we've walked through each component, it's time to see how it all comes together, here's the Pipeline code: with Pipeline(name=\"apigen-example\") as pipeline:\n loader_seeds = LoadDataFromDicts(data=data) # (1)\n\n sampler = DataSampler( # (2)\n data=ds_og,\n size=2,\n samples=len(data),\n batch_size=8,\n )\n\n prep_examples = PrepareExamples() # This step will add the 'examples' column\n\n combine_steps = CombineOutputs() # (3)\n\n model_id = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n llm=InferenceEndpointsLLM( # (4)\n model_id=model_id,\n tokenizer_id=model_id,\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 2048,\n },\n )\n apigen = APIGenGenerator( # (5)\n llm=llm,\n use_default_structured_output=True,\n )\n\n execution_checker = APIGenExecutionChecker(libpath=str(libpath)) # (6)\n semantic_checker = APIGenSemanticChecker(llm=llm) # (7)\n\n sampler >> prep_examples\n (\n [loader_seeds, prep_examples] \n >> combine_steps \n >> apigen\n >> execution_checker\n >> semantic_checker\n )\n -
Load the data seeds we are going to use to generate our function calling dataset. -
The DataSampler together with PrepareExamples will be used to help us create the few-shot examples from the original dataset to be fed in our prompt. -
Combine both columns to obtain a single stream of data. -
We will reuse the same LLM for the generation and the semantic checks. -
Creates the query and answers that will be used together with the tools to fine-tune a new model. It will generate structured outputs to ensure we have valid JSON-formatted answers. -
Adds columns keep_row_after_execution_check and execution_result . -
Adds columns keep_row_after_semantic_check and thought . "},{"location":"sections/pipeline_samples/papers/apigen/#script-and-final-dataset","title":"Script and final dataset","text":"To see all the pieces in place, take a look at the full pipeline, as well as an example row that would be generated from this pipeline. Run python examples/pipeline_apigen.py\n pipeline_apigen.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom pathlib import Path\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs, DataSampler, LoadDataFromDicts\nfrom distilabel.steps.tasks import (\n APIGenExecutionChecker,\n APIGenGenerator,\n APIGenSemanticChecker,\n)\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples, load_module_from_path\n\nlibpath = Path(__file__).parent / \"lib_apigen.py\"\n\ndata = [\n {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n },\n {\n \"func_name\": \"permutation_count\",\n \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n },\n {\n \"func_name\": \"getdivision\",\n \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n },\n {\n \"func_name\": \"binary_addition\",\n \"func_desc\": \"Adds two binary numbers and returns the result as a binary string.\",\n },\n {\n \"func_name\": \"swapi_planet_resource\",\n \"func_desc\": \"get a specific planets resource\",\n },\n {\n \"func_name\": \"disney_character\",\n \"func_desc\": \"Find a specific character using this endpoint\",\n },\n]\n\nlibpath_module = load_module_from_path(libpath)\ntools = libpath_module.get_tools() # call get_tools()\n\n# TODO: Add in the tools between 0 and 2 extra tools to make the task more challenging.\nfor row in data:\n # The tools should have a mix where both the correct and irrelevant tools are present.\n row.update({\"tools\": [tools[row[\"func_name\"]]]})\n\n\nds_og = (\n load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n .shuffle(seed=42)\n .select(range(500))\n .to_list()\n)\n\n\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n loader_seeds = LoadDataFromDicts(data=data)\n sampler = DataSampler(\n data=ds_og,\n size=2,\n samples=len(data),\n batch_size=8,\n )\n\n prep_examples = PrepareExamples()\n\n model_id = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n llm = InferenceEndpointsLLM(\n model_id=model_id,\n tokenizer_id=model_id,\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 2048,\n },\n )\n apigen = APIGenGenerator(\n llm=llm,\n use_default_structured_output=True,\n )\n combine_steps = CombineOutputs()\n\n execution_checker = APIGenExecutionChecker(libpath=str(libpath))\n semantic_checker = APIGenSemanticChecker(llm=llm)\n\n sampler >> prep_examples\n (\n [loader_seeds, prep_examples]\n >> 
combine_steps\n >> apigen\n >> execution_checker\n >> semantic_checker\n )\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run()\n print(distiset[\"default\"][\"train\"][0])\n Example row: {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n \"tools\": [\n {\n \"function\": {\n \"description\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n \"name\": \"final_velocity\",\n \"parameters\": {\n \"properties\": {\n \"acceleration\": {\n \"description\": \"The acceleration of the object.\",\n \"type\": \"number\"\n },\n \"initial_velocity\": {\n \"description\": \"The initial velocity of the object.\",\n \"type\": \"number\"\n },\n \"time\": {\n \"description\": \"The time elapsed.\",\n \"type\": \"number\"\n }\n },\n \"required\": [\n \"initial_velocity\",\n \"acceleration\",\n \"time\"\n ],\n \"type\": \"object\"\n }\n },\n \"type\": \"function\"\n }\n ],\n \"examples\": \"## Query:\\nRetrieve the first 15 comments for post ID '12345' from the Tokapi mobile API.\\n## Answers:\\n[{\\\"name\\\": \\\"v1_post_post_id_comments\\\", \\\"arguments\\\": {\\\"post_id\\\": \\\"12345\\\", \\\"count\\\": 15}}]\\n\\n## Query:\\nRetrieve the detailed recipe for the cake with ID 'cake101'.\\n## Answers:\\n[{\\\"name\\\": \\\"detailed_cake_recipe_by_id\\\", \\\"arguments\\\": {\\\"is_id\\\": \\\"cake101\\\"}}]\\n\\n## Query:\\nWhat are the frequently asked questions and their answers for Coca-Cola Company? Also, what are the suggested tickers based on Coca-Cola Company?\\n## Answers:\\n[{\\\"name\\\": \\\"symbols_faq\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}, {\\\"name\\\": \\\"symbols_suggested\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}]\",\n \"query\": \"What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\",\n \"answers\": \"[{\\\"arguments\\\": {\\\"acceleration\\\": \\\"9.8\\\", \\\"initial_velocity\\\": \\\"0\\\", \\\"time\\\": \\\"10\\\"}, \\\"name\\\": \\\"final_velocity\\\"}]\",\n \"distilabel_metadata\": {\n \"raw_input_a_p_i_gen_generator_0\": [\n {\n \"content\": \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\",\n \"role\": \"system\"\n },\n {\n \"content\": \"Here are examples of queries and the corresponding answers for similar functions:\\n## Query:\\nRetrieve the first 15 comments for post ID '12345' from the Tokapi mobile API.\\n## Answers:\\n[{\\\"name\\\": \\\"v1_post_post_id_comments\\\", \\\"arguments\\\": {\\\"post_id\\\": \\\"12345\\\", \\\"count\\\": 15}}]\\n\\n## Query:\\nRetrieve the detailed recipe for the cake with ID 'cake101'.\\n## Answers:\\n[{\\\"name\\\": \\\"detailed_cake_recipe_by_id\\\", \\\"arguments\\\": {\\\"is_id\\\": \\\"cake101\\\"}}]\\n\\n## Query:\\nWhat are the frequently asked questions and their answers for Coca-Cola Company? Also, what are the suggested tickers based on Coca-Cola Company?\\n## Answers:\\n[{\\\"name\\\": \\\"symbols_faq\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}, {\\\"name\\\": \\\"symbols_suggested\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\n\\nBased on these examples, generate 1 diverse query and answer pairs for the function `final_velocity`.\\nThe detailed function description is the following:\\nCalculates the final velocity of an object given its initial velocity, acceleration, and time.\\n\\nThese are the available tools to help you:\\n[{'type': 'function', 'function': {'name': 'final_velocity', 'description': 'Calculates the final velocity of an object given its initial velocity, acceleration, and time.', 'parameters': {'type': 'object', 'properties': {'initial_velocity': {'type': 'number', 'description': 'The initial velocity of the object.'}, 'acceleration': {'type': 'number', 'description': 'The acceleration of the object.'}, 'time': {'type': 'number', 'description': 'The time elapsed.'}}, 'required': ['initial_velocity', 'acceleration', 'time']}}}]\\n\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n```json\\n[\\n {\\n \\\"query\\\": \\\"The generated query.\\\",\\n \\\"answers\\\": [\\n {\\n \\\"name\\\": \\\"api_name\\\",\\n \\\"arguments\\\": {\\n \\\"arg_name\\\": \\\"value\\\"\\n ... (more arguments as required)\\n }\\n },\\n ... (more API calls as required)\\n ]\\n }\\n]\\n```\\n\\nNow please generate 1 diverse query and answer pairs following the above format.\",\n \"role\": \"user\"\n }\n ],\n \"raw_input_a_p_i_gen_semantic_checker_0\": [\n {\n \"content\": \"As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\\u2019s intentions.\\n4. 
The execution results are irrelevant and do not match the function\\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\",\n \"role\": \"system\"\n },\n {\n \"content\": \"Given Information:\\n- All Available Functions:\\nCalculates the final velocity of an object given its initial velocity, acceleration, and time.\\n- User Query: What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\\n- Generated Function Calls: [{\\\"arguments\\\": {\\\"acceleration\\\": \\\"9.8\\\", \\\"initial_velocity\\\": \\\"0\\\", \\\"time\\\": \\\"10\\\"}, \\\"name\\\": \\\"final_velocity\\\"}]\\n- Execution Results: ['9.8']\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query's intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n \\\"thought\\\": \\\"Concisely describe your reasoning here\\\",\\n \\\"passes\\\": \\\"yes\\\" or \\\"no\\\"\\n}\\n```\\n\",\n \"role\": \"user\"\n }\n ],\n \"raw_output_a_p_i_gen_generator_0\": \"{\\\"pairs\\\": [\\n {\\n \\\"answers\\\": [\\n {\\n \\\"arguments\\\": {\\n \\\"acceleration\\\": \\\"9.8\\\",\\n \\\"initial_velocity\\\": \\\"0\\\",\\n \\\"time\\\": \\\"10\\\"\\n },\\n \\\"name\\\": \\\"final_velocity\\\"\\n }\\n ],\\n \\\"query\\\": \\\"What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\\\"\\n }\\n]}\",\n \"raw_output_a_p_i_gen_semantic_checker_0\": \"{\\n \\\"thought\\\": \\\"\\\",\\n \\\"passes\\\": \\\"yes\\\"\\n}\"\n },\n \"model_name\": \"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n \"keep_row_after_execution_check\": true,\n \"execution_result\": [\n \"9.8\"\n ],\n \"thought\": \"\",\n \"keep_row_after_semantic_check\": true\n}\n -
Read this nice blog post for more information on tools and the reasoning behind get_json_schema : Tool Use, Unified.\u00a0\u21a9 "},{"location":"sections/pipeline_samples/papers/clair/","title":"Contrastive Learning From AI Revisions (CLAIR)","text":"\"Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment\" introduces both Contrastive Learning from AI Revisions (CLAIR), a data-creation method which leads to more contrastive preference pairs, and Anchored Preference Optimization (APO), a controllable and more stable alignment objective. While APO can be found in TRL, we have implemented a task for CLAIR in distilabel . CLAIR is a method for creating preference pairs which minimally revises one output to express a preference, resulting in a more precise learning signal as opposed to conventional methods which use a judge to select a preferred response. The athors from the original paper shared a collection of datasets from CLAIR and APO, where ContextualAI/ultrafeedback_clair_32k corresponds to the CLAIR implementation. "},{"location":"sections/pipeline_samples/papers/clair/#replication","title":"Replication","text":"Note The section is named Replication but in this case we are showing how to use the CLAIR task create revisions for your generations using distilabel . To showcase CLAIR we will be using the CLAIR task implemented in distilabel and we are reusing a small sample of the already generated dataset by ContextualAI ContextualAI/ultrafeedback_clair_32k for testing. "},{"location":"sections/pipeline_samples/papers/clair/#installation","title":"Installation","text":"To reproduce the code below, one will need to install distilabel as follows: pip install \"distilabel>=1.4.0\"\n Depending on the LLM provider you want to use, the requirements may vary, take a look at the dependencies in that case, we are using for the example the free inference endpoints from Hugging Face, but that won't apply for a bigger dataset. "},{"location":"sections/pipeline_samples/papers/clair/#building-blocks","title":"Building blocks","text":"In this case where we already have instructions and their generations, we will just need to load the data and the corresponding CLAIR task for the revisions: CLAIR to generate the revisions. "},{"location":"sections/pipeline_samples/papers/clair/#code","title":"Code","text":"Let's see the full pipeline applied to ContextualAI/ultrafeedback_clair_32k in distilabel : from typing import Any, Dict\n\nfrom datasets import load_dataset\n\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\n\ndef transform_ultrafeedback(example: Dict[str, Any]) -> Dict[str, Any]:\n return {\n \"task\": example[\"prompt\"],\n \"student_solution\": example[\"rejected\"][1][\"content\"],\n }\n\ndataset = (\n load_dataset(\"ContextualAI/ultrafeedback_clair_32k\", split=\"train\")\n .select(range(10)) #\u00a0We collect just 10 examples\n .map(transform_ultrafeedback) # Apply the transformation to get just the text\n)\n\nwith Pipeline(name=\"CLAIR UltraFeedback sample\") as pipeline:\n clair = CLAIR( # (1)\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096\n }\n )\n )\n\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(dataset=dataset) # (2)\n distiset.push_to_hub(repo_id=\"username/clair-test\", include_script=True) # (3)\n -
This Pipeline uses just CLAIR because we already have the generations, but one could include a first task to create generations from instructions, and then generate the revisions with CLAIR. -
Include the dataset directly in the run method for simplicity. -
Push the distiset to the hub with the script for reproducibility. An example dataset can be found at: distilabel-internal-testing/clair-test. "},{"location":"sections/pipeline_samples/papers/deepseek_prover/","title":"DeepSeek Prover","text":"\"DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data\" presents an approach to generate mathematical proofs for theorems generated from informal math problems. This approach shows promising results to advance the capabilities of models towards theorem proving using synthetic data. Until this moment the dataset and the model trained on top of it haven't been opened, let's see how the approach works to reproduce the pipeline using distilabel . The following figure depicts the approach taken to generate the dataset: The authors propose a method for generating Lean 4 proof data from informal mathematical problems. Their approach translates high-school and undergraduate-level mathematical competition problems into formal statements. Here we show how to deal with steps 1 and 2, but the authors ensure the theorems are checked using the lean4 program on the generated proofs, and iterate for a series of steps, fine-tuning a model on the synthetic data (DeepSeek prover 7B), regenerating the dataset, and continue the process until no further improvement is found. "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#replication","title":"Replication","text":"Note The section is named Replication but we will show how we can use distilabel to create the different steps outlined in the DeepSeek-Prover approach. We intentionally let some steps out of the pipeline, but this can easily be extended. We will define the components needed to generate a dataset like the one depicted in the previous figure (we won't call lean4 or do the fine-tuning, this last step can be done outside of distilabel ). The different blocks will have all the docstrings as we would have in the internal steps to showcase how they are done, but they can be omitted for brevity. "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#installation","title":"Installation","text":"To reproduce the code below, we need to install distilabel as it follows: pip install \"distilabel[hf-inference-endpoints]\"\n We have decided to use InferenceEndpointsLLM , but any other provider with a strong model could work. "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#building-blocks","title":"Building blocks","text":"There are three components we needed to define for this pipeline, for the different components in the paper: A task to formalize the original statements, another one to assess the relevance of the theorems, and a final one to generate proofs for the theorems. Note We will use the same LLM for all the tasks, so we will define once and reuse it for the different tasks: llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n)\n "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproverautoformalization","title":"DeepSeekProverAutoFormalization","text":"This Task corresponds to the first step in the figure. Given an informal statement, it will formalize it for us in Lean 4 language, meaning it will translate from an informal statement that could be gathered from the internet, to the lean4 structured language. 
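The task expects the model to wrap the formal statement in a lean4 code block; the format_output method shown below extracts it with a regex. A minimal sketch of that extraction (the Lean statement is illustrative):

```python
import re

_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX = r"```lean4(.*?)```"

# Illustrative model reply containing the formalized statement.
raw_output = "```lean4\ntheorem add_comm_example (a b : ℕ) : a + b = b + a := by omega\n```"

match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, raw_output, re.DOTALL)
formal_statement = match.group(1).strip() if match else None
print(formal_statement)
```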
DeepSeekProverAutoFormalization _PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX = r\"```lean4(.*?)```\"\n\ntemplate_deepseek_prover_auto_formalization = \"\"\"\\\nMathematical Problem in Natural Language:\n{{ informal_statement }}\n{%- if few_shot %}\n\nPlease use the following examples to guide you with the answer:\n{%- for example in examples %}\n- {{ example }}\n{%- endfor %}\n{% endif -%}\"\"\"\n\n\nclass DeepSeekProverAutoFormalization(Task):\n examples: Optional[List[str]] = None\n system_prompt: str = \"Translate the problem to Lean 4 (only the core declaration):\\n```lean4\\nformal statement goes here\\n```\"\n _template: Union[Template, None] = PrivateAttr(...)\n _few_shot: bool = PrivateAttr(default=False)\n\n def load(self) -> None:\n super().load()\n self._template = Template(template_deepseek_prover_auto_formalization)\n\n @property\n def inputs(self) -> List[str]:\n return [\"informal_statement\"]\n\n @property\n def outputs(self):\n return [\"formal_statement\", \"model_name\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n return [\n {\n \"role\": \"system\",\n \"content\": self.system_prompt,\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n informal_statement=input[self.inputs[0]],\n few_shot=bool(self.examples),\n examples=self.examples,\n ),\n },\n ]\n\n @override\n def format_output( # type: ignore\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]: # type: ignore\n match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n if match:\n match = match.group(1).strip()\n return {\"formal_statement\": match}\n Following the paper, they found that the model yields better results if it uses examples in a few shot setting, so this class allows to take some examples to help in generating the formulation. Let's see an example of how we can instantiate it: from textwrap import dedent\n\nexamples = [\n dedent(\"\"\"\n ## Statement in natural language:\n For real numbers k and x:\n If x is equal to (13 - \u221a131) / 4, and\n If the equation 2x\u00b2 - 13x + k = 0 is satisfied,\n Then k must be equal to 19/4.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n dedent(\"\"\"\n ## Statement in natural language:\n The greatest common divisor (GCD) of 20 factorial (20!) and 200,000 is equal to 40,000.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n dedent(\"\"\"\n ## Statement in natural language:\n Given two integers x and y:\n If y is positive (greater than 0),\n And y is less than x,\n And the equation x + y + xy = 80 is true,\n Then x must be equal to 26.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n]\n\nauto_formalization = DeepSeekProverAutoFormalization(\n name=\"auto_formalization\",\n input_batch_size=8,\n llm=llm,\n examples=examples\n)\n "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproverscorer","title":"DeepSeekProverScorer","text":"The next Task corresponds to the second step, the model scoring and assessment. It uses an LLM as judge to evaluate the relevance of the theorem, and assigns a score so it can be filtered afterwards. 
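For instance, once the scorer has run, the rows could be filtered on the assessment column before generating proofs. A minimal sketch with an in-memory datasets.Dataset standing in for the real distiset (the column values are illustrative):

```python
from datasets import Dataset

# Hypothetical scored rows, mimicking the columns produced by the scorer below.
scored = Dataset.from_list(
    [
        {"formal_statement": "theorem t1 ...", "assessment": "excellent"},
        {"formal_statement": "theorem t2 ...", "assessment": "poor"},
    ]
)

keep = {"excellent", "good", "above average"}
filtered = scored.filter(lambda row: row["assessment"] in keep)
print(filtered.num_rows)  # 1
```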
DeepSeekProverScorer template_deepseek_prover_scorer = \"\"\"\\\nTo evaluate whether a formal Lean4 statement will be of interest to the community, consider the following criteria:\n\n1. Relevance to Current Research: Does the statement address a problem or concept that is actively being researched in mathematics or related fields? Higher relevance scores indicate greater potential interest.\n2. Complexity and Depth: Is the statement complex enough to challenge existing theories and methodologies, yet deep enough to provide significant insights or advancements? Complexity and depth showcase Lean4's capabilities and attract interest.\n3. Interdisciplinary Potential: Does the statement offer opportunities for interdisciplinary research, connecting mathematics with other fields such as computer science, physics, or biology? Interdisciplinary projects often garner wide interest.\n4. Community Needs and Gaps: Does the statement fill an identified need or gap within the Lean4 community or the broader mathematical community? Addressing these needs directly correlates with interest.\n5. Innovativeness: How innovative is the statement? Does it propose new methods, concepts, or applications? Innovation drives interest and engagement.\n\nCustomize your evaluation for each problem accordingly, assessing it as 'excellent', 'good', 'above average', 'fair' or 'poor'.\n\nYou should respond in the following format for each statement:\n\n'''\nNatural language: (Detailed explanation of the informal statement, including any relevant background information, assumptions, and definitions.)\nAnalysis: (Provide a brief justification for each score, highlighting why the statement scored as it did across the criteria.)\nAssessment: (Based on the criteria, rate the statement as 'excellent', 'good', 'above average', 'fair' or 'poor'. JUST the Assessment.)\n'''\"\"\"\n\nclass DeepSeekProverScorer(Task):\n _template: Union[Template, None] = PrivateAttr(...)\n\n def load(self) -> None:\n super().load()\n self._template = Template(template_deepseek_prover_scorer)\n\n @property\n def inputs(self) -> List[str]:\n return [\"informal_statement\", \"formal_statement\"]\n\n @property\n def outputs(self):\n return [\"natural_language\", \"analysis\", \"assessment\", \"model_name\"]\n\n def format_input(self, input: str) -> ChatType:\n return [\n {\n \"role\": \"system\",\n \"content\": self._template.render(),\n },\n {\n \"role\": \"user\",\n \"content\": f\"## Informal statement:\\n{input[self.inputs[0]]}\\n\\n ## Formal statement:\\n{input[self.inputs[1]]}\",\n },\n ]\n\n @override\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]:\n try:\n result = output.split(\"Natural language:\")[1].strip()\n natural_language, analysis = result.split(\"Analysis:\")\n analysis, assessment = analysis.split(\"Assessment:\")\n natural_language = natural_language.strip()\n analysis = analysis.strip()\n assessment = assessment.strip()\n except Exception:\n natural_language = analysis = assessment = None\n\n return {\n \"natural_language\": natural_language,\n \"analysis\": analysis,\n \"assessment\": assessment\n }\n "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproversolver","title":"DeepSeekProverSolver","text":"The last task is in charge of generating a proof for the theorems generated in the previous steps. 
DeepSeekProverSolver class DeepSeekProverSolver(Task):\n system_prompt: str = (\n \"You are an expert in proving mathematical theorems formalized in lean4 language. \"\n \"Your answers consist just in the proof to the theorem given, and nothing else.\"\n )\n\n @property\n def inputs(self) -> List[str]:\n return [\"formal_statement\"]\n\n @property\n def outputs(self):\n return [\"proof\"]\n\n def format_input(self, input: str) -> ChatType:\n prompt = dedent(\"\"\"\n Give me a proof for the following theorem:\n ```lean4\n {theorem}\n ```\"\"\"\n )\n return [\n {\n \"role\": \"system\",\n \"content\": self.system_prompt,\n },\n {\n \"role\": \"user\",\n \"content\": prompt.format(theorem=input[\"formal_statement\"]),\n },\n ]\n\n def format_output(\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]:\n import re\n match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n if match:\n match = match.group(1).strip()\n return {\"proof\": match}\n Additionally, the original pipeline defined in the paper includes a step to check the final proofs using the lean 4 language that we have omitted for simplicity. The fine tuning can be done completely offline, and come back to the pipeline after each iteration/training run. All the docstrings have been removed from the code blocks, but can be seen in the full pipeline. "},{"location":"sections/pipeline_samples/papers/deepseek_prover/#code","title":"Code","text":"Lets's put the building blocks together to create the final pipeline with distilabel . For this example we have generated a sample dataset plaguss/informal-mathematical-statements-tiny of informal mathematical statements starting from casey-martin/multilingual-mathematical-autoformalization, but as the paper mentions, we can create formal statements and it's corresponding proofs starting from informal ones: Click to see the full pipeline deepseek_prover.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom pathlib import Path\nfrom textwrap import dedent\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom jinja2 import Template\nfrom pydantic import PrivateAttr\nfrom typing_extensions import override\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks.base import Task\nfrom distilabel.steps.tasks.typing import ChatType\n\n_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX = r\"```lean4(.*?)```\"\n\n\ntemplate_deepseek_prover_auto_formalization = \"\"\"\\\nMathematical Problem in Natural Language:\n{{ informal_statement }}\n{%- if few_shot %}\n\nPlease use the following examples to guide you with the answer:\n{%- for example in examples %}\n- {{ example }}\n{%- endfor %}\n{% endif -%}\"\"\"\n\n\nclass DeepSeekProverAutoFormalization(Task):\n \"\"\"Task to translate a mathematical problem from natural language to Lean 4.\n\n Note:\n A related dataset (MMA from the paper) 
can be found in Hugging Face:\n [casey-martin/multilingual-mathematical-autoformalization](https://huggingface.co/datasets/casey-martin/multilingual-mathematical-autoformalization).\n\n Input columns:\n - informal_statement (`str`): The statement to be formalized using Lean 4.\n\n Output columns:\n - formal_statement (`str`): The formalized statement using Lean 4, to be analysed.\n\n Categories:\n - generation\n\n References:\n - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n - [`Lean 4`](https://github.com/leanprover/lean4).\n\n Examples:\n\n Formalize a mathematical problem from natural language to Lean 4:\n\n ```python\n from distilabel.steps.tasks import DeepSeekProverAutoFormalization\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n prover_autoformal = DeepSeekProverAutoFormalization(\n llm=InferenceEndpointsLLM(\n model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n ),\n )\n\n prover_autoformal.load()\n\n result = next(\n prover_autoformal.process(\n [\n {\"informal_statement\": \"If a polynomial g is monic, then the root of g is integral over the ring R.\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'informal_statement': 'If a polynomial g is monic, then the root of g is integral over the ring R.',\n # 'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n # 'distilabel_metadata': {\n # 'raw_output_deep_seek_prover_auto_formalization_0': '```lean4\\ntheorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\\n```'\n # },\n # 'model_name': 'deepseek-prover'\n # }\n # ]\n ```\n\n Use a few-shot setting to formalize a mathematical problem from natural language to Lean 4:\n\n ```python\n from distilabel.steps.tasks import DeepSeekProverAutoFormalization\n from distilabel.models import InferenceEndpointsLLM\n\n # You can gain inspiration from the following examples to create your own few-shot examples:\n # https://github.com/yangky11/miniF2F-lean4/blob/main/MiniF2F/Valid.lean\n # Consider this as a placeholder for your actual LLM.\n prover_autoformal = DeepSeekProverAutoFormalization(\n llm=InferenceEndpointsLLM(\n model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n ),\n examples=[\n \"theorem amc12a_2019_p21 (z : \u2102) (h\u2080 : z = (1 + Complex.I) / Real.sqrt 2) :\\n\\n((\u2211 k : \u2124 in Finset.Icc 1 12, z ^ k ^ 2) * (\u2211 k : \u2124 in Finset.Icc 1 12, 1 / z ^ k ^ 2)) = 36 := by\\n\\nsorry\",\n \"theorem amc12a_2015_p10 (x y : \u2124) (h\u2080 : 0 < y) (h\u2081 : y < x) (h\u2082 : x + y + x * y = 80) : x = 26 := by\\n\\nsorry\"\n ]\n )\n\n prover_autoformal.load()\n\n result = next(\n prover_autoformal.process(\n [\n {\"informal_statement\": \"If a polynomial g is monic, then the root of g is integral over the ring R.\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'informal_statement': 'If a polynomial g is monic, then the root of g is integral over the ring R.',\n # 'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n # 'distilabel_metadata': {\n # 'raw_output_deep_seek_prover_auto_formalization_0': '```lean4\\ntheorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\\n```'\n # },\n # 'model_name': 'deepseek-prover'\n # }\n # ]\n ```\n \"\"\"\n\n examples: Optional[List[str]] = None\n system_prompt: str = \"Translate the problem to Lean 4 
(only the core declaration):\\n```lean4\\nformal statement goes here\\n```\"\n _template: Union[Template, None] = PrivateAttr(...)\n _few_shot: bool = PrivateAttr(default=False)\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n self._template = Template(template_deepseek_prover_auto_formalization)\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`.\"\"\"\n return [\"informal_statement\"]\n\n @property\n def outputs(self):\n \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n return [\"formal_statement\", \"model_name\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [\n {\n \"role\": \"system\",\n \"content\": self.system_prompt,\n },\n {\n \"role\": \"user\",\n \"content\": self._template.render(\n informal_statement=input[self.inputs[0]],\n few_shot=bool(self.examples),\n examples=self.examples,\n ),\n },\n ]\n\n @override\n def format_output( # type: ignore\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]: # type: ignore\n \"\"\"Extracts the formal statement from the Lean 4 output.\"\"\"\n match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n if match:\n match = match.group(1).strip()\n return {\"formal_statement\": match}\n\n\ntemplate_deepseek_prover_scorer = \"\"\"\\\nTo evaluate whether a formal Lean4 statement will be of interest to the community, consider the following criteria:\n\n1. Relevance to Current Research: Does the statement address a problem or concept that is actively being researched in mathematics or related fields? Higher relevance scores indicate greater potential interest.\n2. Complexity and Depth: Is the statement complex enough to challenge existing theories and methodologies, yet deep enough to provide significant insights or advancements? Complexity and depth showcase Lean4's capabilities and attract interest.\n3. Interdisciplinary Potential: Does the statement offer opportunities for interdisciplinary research, connecting mathematics with other fields such as computer science, physics, or biology? Interdisciplinary projects often garner wide interest.\n4. Community Needs and Gaps: Does the statement fill an identified need or gap within the Lean4 community or the broader mathematical community? Addressing these needs directly correlates with interest.\n5. Innovativeness: How innovative is the statement? Does it propose new methods, concepts, or applications? Innovation drives interest and engagement.\n\nCustomize your evaluation for each problem accordingly, assessing it as 'excellent', 'good', 'above average', 'fair' or 'poor'.\n\nYou should respond in the following format for each statement:\n\n'''\nNatural language: (Detailed explanation of the informal statement, including any relevant background information, assumptions, and definitions.)\nAnalysis: (Provide a brief justification for each score, highlighting why the statement scored as it did across the criteria.)\nAssessment: (Based on the criteria, rate the statement as 'excellent', 'good', 'above average', 'fair' or 'poor'. 
JUST the Assessment.)\n'''\"\"\"\n\n\nclass DeepSeekProverScorer(Task):\n \"\"\"Task to evaluate the quality of a formalized mathematical problem in Lean 4,\n inspired by the DeepSeek-Prover task for scoring.\n\n Note:\n A related dataset (MMA from the paper) can be found in Hugging Face:\n [casey-martin/multilingual-mathematical-autoformalization](https://huggingface.co/datasets/casey-martin/multilingual-mathematical-autoformalization).\n\n Input columns:\n - informal_statement (`str`): The statement to be formalized using Lean 4.\n - formal_statement (`str`): The formalized statement using Lean 4, to be analysed.\n\n Output columns:\n - natural_language (`str`): Explanation for the problem.\n - analysis (`str`): Analysis of the different points defined in the prompt.\n - assessment (`str`): Result of the assessment.\n\n Categories:\n - scorer\n - quality\n - response\n\n References:\n - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n - [`Lean 4`](https://github.com/leanprover/lean4).\n\n Examples:\n\n Analyse a formal statement in Lean 4:\n\n ```python\n from distilabel.steps.tasks import DeepSeekProverScorer\n from distilabel.models import InferenceEndpointsLLM\n\n # Consider this as a placeholder for your actual LLM.\n prover_scorer = DeepSeekProverAutoFormalization(\n llm=InferenceEndpointsLLM(\n model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n ),\n )\n\n prover_scorer.load()\n\n result = next(\n prover_scorer.process(\n [\n {\"formal_statement\": \"theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\"},\n ]\n )\n )\n # result\n # [\n # {\n # 'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n # 'informal_statement': 'INFORMAL',\n # 'analysis': 'ANALYSIS',\n # 'assessment': 'ASSESSMENT',\n # 'distilabel_metadata': {\n # 'raw_output_deep_seek_prover_scorer_0': 'Natural language:\\nINFORMAL\\nAnalysis:\\nANALYSIS\\nAssessment:\\nASSESSMENT'\n # },\n # 'model_name': 'deepseek-prover-scorer'\n # }\n # ]\n ```\n \"\"\"\n\n _template: Union[Template, None] = PrivateAttr(...)\n\n def load(self) -> None:\n \"\"\"Loads the Jinja2 template.\"\"\"\n super().load()\n\n self._template = Template(template_deepseek_prover_scorer)\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `instruction`.\"\"\"\n return [\"informal_statement\", \"formal_statement\"]\n\n @property\n def outputs(self):\n \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n return [\"natural_language\", \"analysis\", \"assessment\", \"model_name\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n is the first interaction from the user within a conversation. 
And the\n `system_prompt` is added as the first message if it exists.\"\"\"\n return [\n {\n \"role\": \"system\",\n \"content\": self._template.render(),\n },\n {\n \"role\": \"user\",\n \"content\": f\"## Informal statement:\\n{input[self.inputs[0]]}\\n\\n ## Formal statement:\\n{input[self.inputs[1]]}\",\n },\n ]\n\n @override\n def format_output( # type: ignore\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]: # type: ignore\n \"\"\"Analyses the formal statement with Lean 4 output and generates an assessment\n and the corresponding informal assessment.\"\"\"\n\n try:\n result = output.split(\"Natural language:\")[1].strip()\n natural_language, analysis = result.split(\"Analysis:\")\n analysis, assessment = analysis.split(\"Assessment:\")\n natural_language = natural_language.strip()\n analysis = analysis.strip()\n assessment = assessment.strip()\n except Exception:\n natural_language = analysis = assessment = None\n\n return {\n \"natural_language\": natural_language,\n \"analysis\": analysis,\n \"assessment\": assessment,\n }\n\n\nclass DeepSeekProverSolver(Task):\n \"\"\"Task to generate a proof for a formal statement (theorem) in lean4.\n\n Input columns:\n - formal_statement (`str`): The formalized statement using Lean 4.\n\n Output columns:\n - proof (`str`): The proof for the formal statement theorem.\n\n Categories:\n - scorer\n - quality\n - response\n\n References:\n - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n \"\"\"\n\n system_prompt: str = (\n \"You are an expert in proving mathematical theorems formalized in lean4 language. \"\n \"Your answers consist just in the proof to the theorem given, and nothing else.\"\n )\n\n @property\n def inputs(self) -> List[str]:\n \"\"\"The input for the task is the `formal_statement`.\"\"\"\n return [\"formal_statement\"]\n\n @property\n def outputs(self):\n \"\"\"The output for the task is the proof for the formal statement theorem.\"\"\"\n return [\"proof\"]\n\n def format_input(self, input: str) -> ChatType: # type: ignore\n \"\"\"The input is formatted as a `ChatType`, with a system prompt to guide our model.\"\"\"\n prompt = dedent(\"\"\"\n Give me a proof for the following theorem:\n ```lean4\n {theorem}\n ```\"\"\")\n return [\n {\n \"role\": \"system\",\n \"content\": self.system_prompt,\n },\n {\n \"role\": \"user\",\n \"content\": prompt.format(theorem=input[\"formal_statement\"]),\n },\n ]\n\n def format_output( # type: ignore\n self, output: Union[str, None], input: Dict[str, Any] = None\n ) -> Dict[str, Any]: # type: ignore\n import re\n\n match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n if match:\n match = match.group(1).strip()\n return {\"proof\": match}\n\n\nexamples = [\n dedent(\"\"\"\n ## Statement in natural language:\n For real numbers k and x:\n If x is equal to (13 - \u221a131) / 4, and\n If the equation 2x\u00b2 - 13x + k = 0 is satisfied,\n Then k must be equal to 19/4.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n dedent(\"\"\"\n ## Statement in natural language:\n The greatest common divisor (GCD) of 20 factorial (20!) 
and 200,000 is equal to 40,000.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n dedent(\"\"\"\n ## Statement in natural language:\n Given two integers x and y:\n If y is positive (greater than 0),\n And y is less than x,\n And the equation x + y + xy = 80 is true,\n Then x must be equal to 26.\n ## Formalized:\n theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n]\n\n\nwith Pipeline(name=\"test_deepseek_prover\") as pipeline:\n data_loader = LoadDataFromHub(\n repo_id=\"plaguss/informal-mathematical-statements-tiny\",\n split=\"val\",\n batch_size=8,\n )\n\n llm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n )\n auto_formalization = DeepSeekProverAutoFormalization(\n name=\"auto_formalization\", input_batch_size=8, llm=llm, examples=examples\n )\n prover_scorer = DeepSeekProverScorer(\n name=\"prover_scorer\",\n input_batch_size=8,\n llm=llm,\n )\n proof_generator = DeepSeekProverSolver(\n name=\"proof_generator\", input_batch_size=8, llm=llm\n )\n\n (data_loader >> auto_formalization >> prover_scorer >> proof_generator)\n\n\nif __name__ == \"__main__\":\n import argparse\n\n parser = argparse.ArgumentParser()\n parser.add_argument(\n \"-d\",\n \"--dry-run\",\n action=argparse.BooleanOptionalAction,\n help=\"Do a dry run for testing purposes.\",\n )\n args = parser.parse_args()\n\n pipeline_parameters = {\n data_loader.name: {\"split\": \"val\"},\n auto_formalization.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.6,\n \"top_p\": 0.9,\n \"max_new_tokens\": 512,\n }\n }\n },\n prover_scorer.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.6,\n \"top_p\": 0.9,\n \"max_new_tokens\": 512,\n }\n }\n },\n }\n\n ds_name = \"test_deepseek_prover\"\n\n if args.dry_run:\n distiset = pipeline.dry_run(batch_size=1, parameters=pipeline_parameters)\n distiset.save_to_disk(Path.home() / f\"Downloads/{ds_name}\")\n\n import pprint\n\n pprint.pprint(distiset[\"default\"][\"train\"][0])\n\n else:\n distiset = pipeline.run(parameters=pipeline_parameters)\n distiset.push_to_hub(ds_name, include_script=True)\n The script can be run run for a dry run or not, depending on the argument (the pipeline will run without dry run by default), and will be pushed to the hub with the name your_username/test_deepseek_prover : python deepseek_prover.py [-d | --dry-run | --no-dry-run]\n Final dataset: plaguss/test_deepseek_prover. "},{"location":"sections/pipeline_samples/papers/deita/","title":"DEITA","text":"DEITA (Data-Efficient Instruction Tuning for Alignment) studies an automatic data selection process by first quantifying the data quality based on complexity, quality and diversity. Second, select the best potential combination from an open-source dataset that would fit into the budget you allocate to tune your own LLM. In most setting we cannot allocate unlimited resources for instruction-tuning LLMs. Therefore, the DEITA authors investigated how to select qualitative data for instruction tuning based on the principle of fewer high-quality samples. Liu et al. tackle the issue of first defining good data and second identifying it to respect an initial budget to instruct-tune your LLM. The strategy utilizes LLMs to replace human effort in time-intensive data quality tasks on instruction-tuning datasets**. 
DEITA introduces a way to measure data quality across three critical dimensions: complexity, quality and diversity. Note that we are again working with a dataset of instructions/responses, and that we are essentially reproducing the second step when we learn how to optimize the responses to a given instruction by comparing several candidates. "},{"location":"sections/pipeline_samples/papers/deita/#datasets-and-budget","title":"Datasets and budget","text":"We will dive deeper into the whole process. We will investigate each stage to efficiently select the final dataset used for supervised fine-tuning with a budget constraint. We will tackle technical challenges by explaining exactly how you would assess good data as presented in the paper. As a reminder, we're looking for a strategy to automatically select good data for the instruction-tuning step when you want to fine-tune an LLM to your own use case, taking into account a resource constraint. This means that you cannot blindly train a model on any data you encounter on the internet. The DEITA authors assume that you have access to open-source datasets that fit your use case. This may not entirely be the case. But with open-source communities tackling many use cases, with projects such as BLOOM or AYA, it's likely that your use case will be tackled at some point. Furthermore, you could generate your own instruction/response pairs with methods such as self-generated instructions using distilabel. This tutorial assumes that we have a data pool with more samples than the project's cost constraint allows. In short, we aim to achieve adequate performance from fewer samples. The authors claim that the subsample size \"correlates proportionally with the computation consumed in instruction tuning\". Hence, to a first approximation, reducing the sample size means reducing computation consumption and so the total development cost. Following the paper's notation, we will associate the budget m with a number of instruction/response pairs that you can set depending on your real budget. To match the experimental set-up, dataset X_sota is a meta-dataset combining major open-source datasets available to instruct-tune LLMs. This dataset is composed of ShareGPT (58k instruction/response pairs), UltraChat (105k instruction/response pairs) and WizardLM (143k instruction/response pairs). It sums to more than 300k instruction/response pairs. We aim to reduce the final subsample to 6k instruction/response pairs. "},{"location":"sections/pipeline_samples/papers/deita/#setup-the-notebook-and-packages","title":"Setup the notebook and packages","text":"Let's prepare our dependencies: pip install \"distilabel[openai,hf-transformers]>=1.0.0\"\npip install pynvml huggingface_hub argilla\n Import distilabel: from distilabel.models import TransformersLLM, OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import ConversationTemplate, DeitaFiltering, ExpandColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import ComplexityScorer, EvolInstruct, EvolQuality, GenerateEmbeddings, QualityScorer\n Define the distilabel Pipeline and load the dataset from the Hugging Face Hub. 
pipeline = Pipeline(name=\"DEITA\")\n\nload_data = LoadDataFromHub(\n name=\"load_data\", batch_size=100, output_mappings={\"prompt\": \"instruction\"}, pipeline=pipeline\n)\n "},{"location":"sections/pipeline_samples/papers/deita/#evol-instruct-generate-instructions-with-an-llm","title":"EVOL-INSTRUCT: Generate Instructions with an LLM","text":"Evol-Instruct automates the creation of complex instruction data for training large language models (LLMs) by progressively rewriting an initial set of instructions into more complex forms. This generated data is then used to fine-tune a model named WizardLM. Evaluations show that instructions from Evol-Instruct are superior to human-created ones, and WizardLM achieves performance close to or exceeding GPT3.5-turbo in many skills. In distilabel, we initialise each step of the data generation pipeline. Later, we'll connect them together. evol_instruction_complexity = EvolInstruct(\n name=\"evol_instruction_complexity\",\n llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n num_evolutions=5,\n store_evolutions=True,\n generate_answers=True,\n include_original_instruction=True,\n pipeline=pipeline,\n)\n\nevol_instruction_complexity.load()\n\n_evolved_instructions = next(evol_instruction_complexity.process(\n ([{\"instruction\": \"How many fish are there in a dozen fish?\"}]))\n)\n\nprint(*_evolved_instructions, sep=\"\\n\")\n Output: ( 1, 'How many fish are there in a dozen fish?')\n( 2, 'How many rainbow trout are there in a dozen rainbow trout?')\n( 3, 'What is the average weight in pounds of a dozen rainbow trout caught in a specific river in Alaska during the month of May?')\n "},{"location":"sections/pipeline_samples/papers/deita/#evol-complexity-evaluate-complexity-of-generated-instructions","title":"EVOL COMPLEXITY: Evaluate complexity of generated instructions","text":"The second step is the evaluation of complexity for an instruction in a given instruction-response pair. Like EVOL-INSTRUCT, this method uses LLMs instead of humans to automatically improve instructions, specifically through their complexity. From any instruction-response pair, \\((I, R)\\), we first generate new instructions following the In-Depth Evolving Response. We generate more complex instructions through prompting, as explained by authors, by adding some constraints or reasoning steps. Let\\'s take an example from GPT-4-LLM which aims to generate observations by GPT-4 to instruct-tune LLMs with supervised fine-tuning. And, we have the instruction \\(instruction_0\\): instruction_0 = \"Give three tips for staying healthy.\"\n To make it more complex, you can use, as the authors did, some prompt templates to add constraints or deepen the instruction. They provided some prompts in the paper appendix. For instance, this one was used to add constraints: PROMPT = \"\"\"I want you act as a Prompt Rewriter.\nYour objective is to rewrite a given prompt into a more complex version to\nmake those famous AI systems (e.g., ChatGPT and GPT4) a bit harder to handle.\nBut the rewritten prompt must be reasonable and must be understood and\nresponded by humans.\nYour rewriting cannot omit the non-text parts such as the table and code in\n#Given Prompt#:. 
Also, please do not omit the input in #Given Prompt#.\nYou SHOULD complicate the given prompt using the following method:\nPlease add one more constraints/requirements into #Given Prompt#\nYou should try your best not to make the #Rewritten Prompt# become verbose,\n#Rewritten Prompt# can only add 10 to 20 words into #Given Prompt#.\n\u2018#Given Prompt#\u2019, \u2018#Rewritten Prompt#\u2019, \u2018given prompt\u2019 and \u2018rewritten prompt\u2019\nare not allowed to appear in #Rewritten Prompt#\n#Given Prompt#:\n<Here is instruction>\n#Rewritten Prompt#:\n\"\"\"\n Prompting this to an LLM, you automatically get a more complex instruction, called \\(instruction_1\\), from an initial instruction \\(instruction_0\\): instruction_1 = \"Provide three recommendations for maintaining well-being, ensuring one focuses on mental health.\"\n With sequences of evolved instructions, we use a further LLM to automatically rank and score them. We provide the 6 instructions at the same time. By providing all instructions together, we force the scoring model to look at minor complexity differences between evolved instructions. Encouraging the model to discriminate between instructions. Taking the example below, \\(instruction_0\\) and \\(instruction_1\\) could deserve the same score independently, but when compared together we would notice the slight difference that makes \\(instruction_1\\) more complex. In distilabel , we implement this like so: instruction_complexity_scorer = ComplexityScorer(\n name=\"instruction_complexity_scorer\",\n llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n input_mappings={\"instructions\": \"evolved_instructions\"},\n pipeline=pipeline,\n)\n\nexpand_evolved_instructions = ExpandColumns(\n name=\"expand_evolved_instructions\",\n columns=[\"evolved_instructions\", \"answers\", \"scores\"],\n output_mappings={\n \"evolved_instructions\": \"evolved_instruction\",\n \"answers\": \"answer\",\n \"scores\": \"evol_instruction_score\",\n },\n pipeline=pipeline,\n)\n\ninstruction_complexity_scorer.load()\n\n_evolved_instructions = next(instruction_complexity_scorer.process(([{\"evolved_instructions\": [PROMPT + instruction_1]}])))\n\nprint(\"Original Instruction:\")\nprint(instruction_1)\nprint(\"\\nEvolved Instruction:\")\nprint(_evolved_instructions[0][\"evolved_instructions\"][0].split(\"#Rewritten Prompt#:\\n\")[1])\n Output: Original Instruction:\nProvide three recommendations for maintaining well-being, ensuring one focuses on mental health.\n\nEvolved Instruction:\nSuggest three strategies for nurturing overall well-being, with the stipulation that at least one explicitly addresses the enhancement of mental health, incorporating evidence-based practices.\n "},{"location":"sections/pipeline_samples/papers/deita/#evol-quality-quality-evaluation","title":"EVOL-QUALITY: Quality Evaluation","text":"Now that we have scored the complexity of the instructions, we will focus on the quality of the responses. Similar to EVOL COMPLEXITY, the authors introduced EVOL QUALITY, a method based on LLMs, instead of humans, to automatically score the quality of the response. From an instruction-response pair, \\((I, R)\\), the goal is to make the response evolve into a more helpful and relevant response. The key difference is that we need to also provide the first instruction to guide evolution. Let's take back our example from GPT-4-LLM. Here we have the response \\(response_0\\) and its initial instruction \\(instruction_0\\): instruction_0 = \"Give three tips for staying healthy.\"\nreponse_0 = \"1. 
Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases. 2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week. 3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\"\n Again the authors provided several prompts you could use to make your response evolve according to some guidelines. For example, this one was used to enrich the answer: PROMPT = \"\"\"I want you to act as a Response Rewriter\nYour goal is to enhance the quality of the response given by an AI assistant\nto the #Given Prompt# through rewriting.\nBut the rewritten response must be reasonable and must be understood by humans.\nYour rewriting cannot omit the non-text parts such as the table and code in\n#Given Prompt# and #Given Response#. Also, please do not omit the input\nin #Given Prompt#.\nYou Should enhance the quality of the response using the following method:\nPlease make the Response more in-depth\nYou should try your best not to make the #Rewritten Response# become verbose,\n#Rewritten Response# can only add 10 to 20 words into #Given Response#.\n\u2018#Given Response#\u2019, \u2018#Rewritten Response#\u2019, \u2018given response\u2019 and \u2018rewritten response\u2019\nare not allowed to appear in #Rewritten Response#\n#Given Prompt#:\n<instruction_0>\n#Given Response#:\n<response_0>\n#Rewritten Response#:\n\"\"\"\n Prompting this to an LLM, you will automatically get a more enriched response, called \\(response_1\\), from an initial response \\(response_0\\) and initial instruction \\(instruction_0\\): evol_response_quality = EvolQuality(\n name=\"evol_response_quality\",\n llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n num_evolutions=5,\n store_evolutions=True,\n include_original_response=True,\n input_mappings={\n \"instruction\": \"evolved_instruction\",\n \"response\": \"answer\",\n },\n pipeline=pipeline,\n)\n\nevol_response_quality.load()\n\n_evolved_responses = next(evol_response_quality.process([{\"instruction\": PROMPT + instruction_0, \"response\": reponse_0}]))\n\nprint(\"Original Response:\")\nprint(reponse_0)\nprint(\"\\nEvolved Response:\")\nprint(*_evolved_responses[0]['evolved_responses'], sep=\"\\n\")\n And now, as in EVOL COMPLEXITY you iterate through this path and use different prompts to make your responses more relevant, helpful or creative. In the paper, they make 4 more iterations to get 5 evolved responses \\((R0, R1, R2, R3, R4)\\) which makes 5 different responses for one initial instruction at the end of this step. 
response_quality_scorer = QualityScorer(\n name=\"response_quality_scorer\",\n llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n input_mappings={\n \"instruction\": \"evolved_instruction\",\n \"responses\": \"evolved_responses\",\n },\n pipeline=pipeline,\n)\n\nexpand_evolved_responses = ExpandColumns(\n name=\"expand_evolved_responses\",\n columns=[\"evolved_responses\", \"scores\"],\n output_mappings={\n \"evolved_responses\": \"evolved_response\",\n \"scores\": \"evol_response_score\",\n },\n pipeline=pipeline,\n)\n\nresponse_quality_scorer.load()\n\n_scored_responses = next(response_quality_scorer.process([{\"instruction\": PROMPT + instruction_0, \"responses\": _evolved_responses[0]['evolved_responses']}]))\n\nprint(\"Original Response:\")\nprint(reponse_0)\n\nprint(\"\\nScore, Evolved Response:\")\nprint(*zip(_scored_responses[0][\"scores\"], _evolved_responses[0]['evolved_responses']), sep=\"\\n\")\n Output: Original Response:\n1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases. 2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week. 3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\n\nScore, Evolved Response:\n(4.0, 'Here are three essential tips for maintaining good health: \\n1. Prioritize regular exercise \\n2. Eat a balanced diet with plenty of fruits and vegetables \\n3. Get an adequate amount of sleep each night.')\n(2.0, 'Here are three effective strategies to maintain a healthy lifestyle.')\n(5.0, 'Here are three practical tips to maintain good health: Ensure a balanced diet, engage in regular exercise, and prioritize sufficient sleep. These practices support overall well-being.')\n "},{"location":"sections/pipeline_samples/papers/deita/#improving-data-diversity","title":"Improving Data Diversity","text":"One main component of good data to instruct-tune LLMs is diversity. Real world data can often contain redundancy due repetitive and homogeneous data. The authors of the DEITA paper tackle the challenge of ensuring data diversity in the instruction tuning LLMs to avoid the pitfalls of data redundancy that can lead to over-fitting or poor generalization. They propose an embedding-based method to filter data for diversity. This method, called Repr Filter, uses embeddings generated by the Llama 1 13B model to represent instruction-response pairs in a vector space. The diversity of a new data sample is assessed based on the cosine distance between its embedding and that of its nearest neighbor in the already selected dataset. If this distance is greater than a specified threshold, the sample is considered diverse and is added to the selection. This process prioritizes diversity by assessing each sample's contribution to the variety of the dataset until the data selection budget is met. This approach effectively maintains the diversity of the data used for instruction tuning, as demonstrated by the DEITA models outperforming or matching state-of-the-art models with significantly less training data. 
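To make the selection loop more concrete, here is a minimal sketch of such an embedding-based diversity filter (an illustrative snippet rather than the actual DeitaFiltering implementation; the function name and arguments are ours, and it assumes the embeddings and the combined evol scores are already available as NumPy arrays): import numpy as np\n\ndef diversity_filter(embeddings, scores, data_budget, diversity_threshold):\n    # Normalise the embeddings so that cosine distance is 1 minus the dot product\n    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)\n    selected = []\n    # Walk the samples from the highest to the lowest scored one\n    for idx in np.argsort(scores)[::-1]:\n        if selected:\n            # Cosine distance to the nearest already selected sample\n            nearest = (1 - normed[selected] @ normed[idx]).min()\n            if nearest <= diversity_threshold:\n                continue  # too close to something we already kept, skip it\n        selected.append(int(idx))\n        if len(selected) >= data_budget:\n            break\n    return selected\n 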
In this implementation of DEITA we use the hidden state of the last layer of the Llama 2 model to generate embeddings, instead of a sentence transformer model, because we found that it improved the diversity of the data selection. generate_conversation = ConversationTemplate(\n name=\"generate_conversation\",\n input_mappings={\n \"instruction\": \"evolved_instruction\",\n \"response\": \"evolved_response\",\n },\n pipeline=pipeline,\n)\n\ngenerate_embeddings = GenerateEmbeddings(\n name=\"generate_embeddings\",\n llm=TransformersLLM(\n model=\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n device=\"cuda\",\n torch_dtype=\"float16\",\n ),\n input_mappings={\"text\": \"conversation\"},\n input_batch_size=5,\n pipeline=pipeline,\n)\n\ndeita_filtering = DeitaFiltering(name=\"deita_filtering\", pipeline=pipeline)\n "},{"location":"sections/pipeline_samples/papers/deita/#build-the-distilabel-pipeline","title":"Build the \u2697 distilabel Pipeline ","text":"Now we're ready to build a distilabel pipeline using the DEITA method: load_data.connect(evol_instruction_complexity)\nevol_instruction_complexity.connect(instruction_complexity_scorer)\ninstruction_complexity_scorer.connect(expand_evolved_instructions)\nexpand_evolved_instructions.connect(evol_response_quality)\nevol_response_quality.connect(response_quality_scorer)\nresponse_quality_scorer.connect(expand_evolved_responses)\nexpand_evolved_responses.connect(generate_conversation)\ngenerate_conversation.connect(generate_embeddings)\ngenerate_embeddings.connect(deita_filtering)\n Now we can run the pipeline. We use the step names to reference them in the pipeline configuration: distiset = pipeline.run(\n    parameters={\n \"load_data\": {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-50\",\n \"split\": \"train\",\n },\n \"evol_instruction_complexity\": {\n \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 512, \"temperature\": 0.7}}\n },\n \"instruction_complexity_scorer\": {\n \"llm\": {\"generation_kwargs\": {\"temperature\": 0.0}}\n },\n \"evol_response_quality\": {\n \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 512, \"temperature\": 0.7}}\n },\n \"response_quality_scorer\": {\"llm\": {\"generation_kwargs\": {\"temperature\": 0.0}}},\n \"deita_filtering\": {\"data_budget\": 500, \"diversity_threshold\": 0.04},\n },\n use_cache=False,\n)\n We can push the results to the Hugging Face Hub: distiset.push_to_hub(\"distilabel-internal-testing/deita-colab\")\n "},{"location":"sections/pipeline_samples/papers/deita/#results","title":"Results","text":"Again, to show the relevance of the EVOL QUALITY method, the authors evaluated, on MT-bench, models fine-tuned with different data selections, according to how we defined quality responses for an instruction. Each time they selected 6k samples according to the quality score: Credit: Liu et al. (2023) The score is much better when selecting data with the EVOL QUALITY method than when we select randomly or according to length (treating a longer response as a more qualitative one). However, the margin we observed for the complexity score is thinner here; we'll discuss the strategy in a later part. Nevertheless, this strategy looks to improve the fine-tuning compared to the baselines, and now we're interested in mixing quality and complexity assessment with a diversity evaluation to find the right trade-off in our selection process. "},{"location":"sections/pipeline_samples/papers/deita/#conclusion","title":"Conclusion","text":"In conclusion, if you are looking for an efficient method to align an open-source LLM to your business case with a constrained budget, the solutions provided by DEITA are really worth a shot. This data-centric approach enables you to focus on the content of the dataset to get the best results, instead of \"just\" scaling up instruction-tuning with more, and likely lower-quality, data. In a nutshell, the strategy, by automatically scoring instructions and responses, aims to substitute the human preference step that proprietary models such as GPT-4 have been trained with. There are a few improvements we could think about when it comes to selecting good data, but it opens a promising path for instruction-tuning LLMs with lower computational needs, making the whole process more sustainable than most other methods. We'd be happy to help you align an LLM with your business case, drawing inspiration from such a methodology. "},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/","title":"Instruction Backtranslation","text":"\"Self Alignment with Instruction Backtranslation\" presents a scalable method to build a high-quality instruction-following language model by automatically labeling human-written text with corresponding instructions. Their approach, named instruction backtranslation, starts with a language model finetuned on a small amount of seed data, and a given web corpus. The seed model is used to construct training examples by generating instruction prompts for web documents (self-augmentation), and then selecting high-quality examples from among these candidates (self-curation). This data is then used to finetune a stronger model. Their self-training approach assumes access to a base language model, a small amount of seed data, and a collection of unlabelled examples, e.g. a web corpus. The unlabelled data is a large, diverse set of human-written documents that includes writing about all manner of topics humans are interested in \u2013 but crucially is not paired with instructions. A first key assumption is that there exists some subset of this very large human-written text that would be suitable as gold generations for some user instructions. A second key assumption is that they can predict instructions for these candidate gold answers that can be used as high-quality example pairs to train an instruction-following model. Their overall process, called instruction backtranslation, performs two core steps: -
Self-augment: Generate instructions for unlabelled data, i.e. the web corpus, to produce candidate training data of (instruction, output) pairs for instruction tuning. -
Self-curate: Self-select high-quality demonstration examples as training data to finetune the base model to follow instructions. This approach is done iteratively, where a better intermediate instruction-following model can improve the data selection for finetuning in the next iteration. This replication covers the self-curation step, i.e. the second step mentioned above, so that we can use the proposed prompting approach to rate the quality of the generated text, which can either be synthetically generated or real human-written text. "},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#replication","title":"Replication","text":"To replicate the paper we will be using distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes. "},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#installation","title":"Installation","text":"To replicate Self Alignment with Instruction Backtranslation one will need to install distilabel as follows: pip install \"distilabel[hf-inference-endpoints,openai]>=1.0.0\"\n And since we will be using InferenceEndpointsLLM (installed via the extra hf-inference-endpoints ) we will need to deploy those in advance, either locally or in the Hugging Face Hub (alternatively, the serverless endpoints can also be used, but inference is usually slower and there's a limited quota since they are free), and set both the HF_TOKEN (to use the InferenceEndpointsLLM ) and the OPENAI_API_KEY (to use the OpenAILLM ) environment variables. "},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#building-blocks","title":"Building blocks","text":" LoadDataFromHub : Generator Step to load a dataset from the Hugging Face Hub. TextGeneration : Task to generate responses for a given instruction using an LLM. InferenceEndpointsLLM : LLM that runs a model from an Inference Endpoint in the Hugging Face Hub. InstructionBacktranslation : Task that generates a score and a reason for a response for a given instruction using the Self Alignment with Instruction Backtranslation prompt. OpenAILLM : LLM that loads a model from OpenAI. "},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#code","title":"Code","text":"As mentioned before, we will now put the building blocks described above together to replicate Self Alignment with Instruction Backtranslation. 
from distilabel.models import InferenceEndpointsLLM, OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub, KeepColumns\nfrom distilabel.steps.tasks import InstructionBacktranslation, TextGeneration\n\n\nwith Pipeline(name=\"self-alignment-with-instruction-backtranslation\") as pipeline:\n load_hub_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"},\n )\n\n text_generation = TextGeneration(\n name=\"text_generation\",\n llm=InferenceEndpointsLLM(\n base_url=\"<INFERENCE_ENDPOINT_URL>\",\n tokenizer_id=\"argilla/notus-7b-v1\",\n model_display_name=\"argilla/notus-7b-v1\",\n ),\n input_batch_size=10,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n instruction_backtranslation = InstructionBacktranslation(\n name=\"instruction_backtranslation\",\n llm=OpenAILLM(model=\"gpt-4\"),\n input_batch_size=10,\n output_mappings={\"model_name\": \"scoring_model\"},\n )\n\n keep_columns = KeepColumns(\n name=\"keep_columns\",\n columns=[\n \"instruction\",\n \"generation\",\n \"generation_model\",\n \"score\",\n \"reason\",\n \"scoring_model\",\n ],\n )\n\n load_hub_dataset >> text_generation >> instruction_backtranslation >> keep_columns\n Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched. distiset = pipeline.run(\n parameters={\n load_hub_dataset.name: {\n \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n \"split\": \"test\",\n },\n text_generation.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 1024,\n \"temperature\": 0.7,\n },\n },\n },\n instruction_backtranslation.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 1024,\n \"temperature\": 0.7,\n },\n },\n },\n },\n)\n Finally, we can optionally push the generated dataset, named Distiset , to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub. distiset.push_to_hub(\n \"instruction-backtranslation-instruction-dataset\",\n private=True,\n)\n "},{"location":"sections/pipeline_samples/papers/math_shepherd/","title":"Create datasets to train a Process Reward Model using Math-Shepherd","text":"This example will introduce Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations, an innovative math process reward model (PRM) which assigns reward scores to each step of math problem solutions. Specifically, we will present a recipe to create datasets to train such models. The final sections contain 2 pipeline examples to run the pipeline depending with more or less resources. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#replica","title":"Replica","text":"Unlike traditional models that only look at final answers (Output Reward Models or ORM), this system evaluates each step of a mathematical solution and assigns reward scores to individual solution steps. Let's see the Figure 2 from the paper, which makes a summary of the labelling approach presented in their work. In the traditional ORM approach, the annotation was done depending on the final outcome, while the Process Reward Model (PRM) allows labelling the different steps that lead to a solution, making for a richer set of information. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#steps-involved","title":"Steps involved","text":" -
MathShepherdGenerator : This step is in charge of generating solutions for the instruction. Depending on the value set for M , this step can be used to generate either the golden_solution , to be used as a reference for the labeller, or the set of solutions to be labelled. For the solutions column we want some diversity, allowing the model to reach both good and bad solutions so that we have a representative sample for the labeller, so it may be better to use a \"weaker\" model. -
MathShepherdCompleter . This task does the job of the completer in the paper, generating completions as presented in Figure 2, section 3.3.2. It doesn't generate a column on its own, but updates the steps generated in the solutions column from the MathShepherdGenerator , using the golden_solution as the reference to label the data. So in order for this step to work, we need both of these columns in our dataset. Depending on the type of dataset, we may already have access to the golden_solution , even if it's under a different name, but that is not usually the case for the solutions . -
FormatPRM . This step does the auxiliary job of preparing the data to follow the format defined in the paper of having two columns input and label . After running the MathShepherdCompleter , we have raw data that can be formatted as the user want. Using ExpandColumns and this step, one can directly obtain the same format presented in the dataset shared in the paper: peiyi9979/Math-Shepherd. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#data-preparation","title":"Data preparation","text":"For this example, just as the original paper, we are using the openai/gsm8k dataset. We only need a dataset with instructions to be solved (in this case it corresponds to the question column), and we can generate everything else using our predefined steps. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#building-the-pipeline","title":"Building the pipeline","text":"The pipeline uses openai/gsm8k as reference, but the pipeline can be applied to different datasets, keep in mind the prompts can be modified with the current definition, by tweaking the extra_rules and few_shots in each task: from datasets import load_dataset\n\nfrom distilabel.steps.tasks import MathShepherdCompleter, MathShepherdGenerator, FormatPRM\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs, ExpandColumns\n\nds_name = \"openai/gsm8k\"\n\nds = load_dataset(ds_name, \"main\", split=\"test\").rename_column(\"question\", \"instruction\").select(range(3)) # (1)\n\nwith Pipeline(name=\"Math-Shepherd\") as pipe:\n model_id_70B = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n model_id_8B = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n\n llm_70B = InferenceEndpointsLLM(\n model_id=model_id_70B,\n tokenizer_id=model_id_70B,\n generation_kwargs={\"max_new_tokens\": 1024, \"temperature\": 0.6},\n )\n llm_8B = InferenceEndpointsLLM(\n model_id=model_id_8B,\n tokenizer_id=model_id_8B,\n generation_kwargs={\"max_new_tokens\": 2048, \"temperature\": 0.6},\n ) # (2)\n\n generator_golden = MathShepherdGenerator(\n name=\"golden_generator\",\n llm=llm_70B,\n ) # (3)\n generator = MathShepherdGenerator(\n name=\"generator\",\n llm=llm_8B,\n use_default_structured_output=True, # (9)\n M=5\n ) #\u00a0(4)\n completer = MathShepherdCompleter(\n name=\"completer\",\n llm=llm_8B,\n use_default_structured_output=True,\n N=4\n ) # (5)\n\n combine = CombineOutputs()\n\n expand = ExpandColumns(\n name=\"expand_columns\",\n columns=[\"solutions\"],\n split_statistics=True,\n ) #\u00a0(6)\n formatter = FormatPRM(name=\"format_prm\") # (7)\n\n [generator_golden, generator] >> combine >> completer >> expand >> formatter # (8)\n -
We will use just 3 rows from the sample dataset, and rename the \"question\" column to \"instruction\", to provide the input expected by the MathShepherdGenerator . -
We will use 2 different LLMs, meta-llama/Meta-Llama-3.1-70B-Instruct (a stronger model for the golden_solution ) and meta-llama/Meta-Llama-3.1-8B-Instruct (a weaker one to generate candidate solutions, and the completions). -
This MathShepherdGenerator task, which uses the stronger model, will generate the golden_solution for us, the step-by-step solution for the task. -
Another MathShepherdGenerator task, but in this case using the weaker model, will generate candidate solutions (M=5 in total). -
Now the MathShepherdCompleter task will generate N=4 completions for each step of each candidate solution in the solutions column, and label them using the golden_solution as shown in Figure 2 of the paper. This step adds the labels (it uses [+ and -] tags following the implementation in the paper, but these values can be modified) to the solutions column in place, instead of generating an additional column, but the intermediate completions won't be shown at the end. -
The ExpandColumns step expands the solutions to match the instruction, so if we had set M=5, we would now have 5 instruction-solution pairs. We set split_statistics to True to ensure the distilabel_metadata is split accordingly; otherwise the number of tokens for each solution would be counted as the tokens needed for the whole list of generated solutions. One can omit both this and the following step and process the data for training as preferred. -
And finally, the FormatPRM generates two columns, input and label , which prepare the data for training as presented in the original Math-Shepherd dataset. -
Both the generator_golden and generator can be run in parallel as there's no dependency between them, and after that we combine the results and pass them to the completer . Finally, we use the expand and formatter steps to prepare the data in the format expected to train the Process Reward Model as defined in the original paper. -
Generate structured outputs to ensure it's easier to parse them, otherwise the models can fail a lot of times with an easy to parse list. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#script-and-final-dataset","title":"Script and final dataset","text":"To see all the pieces in place, take a look at the full pipeline: Run python examples/pipe_math_shepherd.py\n Full pipeline pipe_math_shepherd.py# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs, ExpandColumns\nfrom distilabel.steps.tasks import (\n FormatPRM,\n MathShepherdCompleter,\n MathShepherdGenerator,\n)\n\nds_name = \"openai/gsm8k\"\n\nds = (\n load_dataset(ds_name, \"main\", split=\"test\")\n .rename_column(\"question\", \"instruction\")\n .select(range(3))\n)\n\n\nwith Pipeline(name=\"Math-Shepherd\") as pipe:\n model_id_70B = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n model_id_8B = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n\n llm_70B = InferenceEndpointsLLM(\n model_id=model_id_8B,\n tokenizer_id=model_id_8B,\n generation_kwargs={\"max_new_tokens\": 1024, \"temperature\": 0.5},\n )\n llm_8B = InferenceEndpointsLLM(\n model_id=model_id_8B,\n tokenizer_id=model_id_8B,\n generation_kwargs={\"max_new_tokens\": 2048, \"temperature\": 0.7},\n )\n\n generator_golden = MathShepherdGenerator(\n name=\"golden_generator\",\n llm=llm_70B,\n )\n generator = MathShepherdGenerator(\n name=\"generator\",\n llm=llm_8B,\n M=5,\n )\n completer = MathShepherdCompleter(name=\"completer\", llm=llm_8B, N=4)\n\n combine = CombineOutputs()\n\n expand = ExpandColumns(\n name=\"expand_columns\",\n columns=[\"solutions\"],\n split_statistics=True,\n )\n formatter = FormatPRM(name=\"format_prm\")\n [generator_golden, generator] >> combine >> completer >> expand >> formatter\n\n\nif __name__ == \"__main__\":\n distiset = pipe.run(use_cache=False, dataset=ds)\n distiset.push_to_hub(\"plaguss/test_math_shepherd_prm\")\n The resulting dataset can be seen at: plaguss/test_math_shepherd_prm. "},{"location":"sections/pipeline_samples/papers/math_shepherd/#pipeline-with-vllm-and-ray","title":"Pipeline with vLLM and ray","text":"This section contains an alternative way of running the pipeline with a bigger outcome. To showcase how to scale the pipeline, we are using for the 3 generating tasks Qwen/Qwen2.5-72B-Instruct, highly improving the final quality as it follows much closer the prompt given. Also, we are using vLLM and 3 nodes (one per task in this case), to scale up the generation process. 
Math-Shepherd's bigger pipeline from datasets import load_dataset\n\nfrom distilabel.models import vLLM\nfrom distilabel.steps import StepResources\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs, ExpandColumns\nfrom distilabel.steps.tasks import (\n FormatPRM,\n MathShepherdCompleter,\n MathShepherdGenerator,\n)\n\nds_name = \"openai/gsm8k\"\n\nds = (\n load_dataset(ds_name, \"main\", split=\"test\")\n .rename_column(\"question\", \"instruction\")\n)\n\n\nwith Pipeline(name=\"Math-Shepherd\").ray() as pipe: # (1)\n\n model_id_72B = \"Qwen/Qwen2.5-72B-Instruct\"\n\n llm_72B = vLLM(\n model=model_id_72B,\n tokenizer=model_id_72B,\n extra_kwargs={\n \"tensor_parallel_size\": 8, # Number of GPUs per node\n \"max_model_len\": 2048,\n },\n generation_kwargs={\n \"temperature\": 0.5,\n \"max_new_tokens\": 4096,\n },\n )\n\n generator_golden = MathShepherdGenerator(\n name=\"golden_generator\",\n llm=llm_72B,\n input_batch_size=50,\n output_mappings={\"model_name\": \"model_name_golden_generator\"},\n resources=StepResources(replicas=1, gpus=8) # (2)\n )\n generator = MathShepherdGenerator(\n name=\"generator\",\n llm=llm_72B,\n input_batch_size=50,\n M=5,\n use_default_structured_output=True,\n output_mappings={\"model_name\": \"model_name_generator\"},\n resources=StepResources(replicas=1, gpus=8)\n )\n completer = MathShepherdCompleter(\n name=\"completer\", \n llm=llm_72B,\n N=8,\n use_default_structured_output=True,\n output_mappings={\"model_name\": \"model_name_completer\"},\n resources=StepResources(replicas=1, gpus=8)\n )\n\n combine = CombineOutputs()\n\n expand = ExpandColumns(\n name=\"expand_columns\",\n columns=[\"solutions\"],\n split_statistics=True,\n\n )\n formatter = FormatPRM(name=\"format_prm\", format=\"trl\") # (3)\n\n [generator_golden, generator] >> combine >> completer >> expand >> formatter\n\n\nif __name__ == \"__main__\":\n distiset = pipe.run(use_cache=False, dataset=ds, dataset_batch_size=50)\n if distiset:\n distiset.push_to_hub(\"plaguss/test_math_shepherd_prm_ray\")\n -
Transform the pipeline to run using the ray backend. -
Assign the resources: the number of replicas is 1, as we want a single instance of the task per node, and the number of GPUs is 8, so each task uses a whole node. Given that we defined the script in the slurm file to use 3 nodes, this will use all 3 available nodes, with 8 GPUs for each of these tasks. -
Prepare the columns in the format expected by TRL for training. Click to see the slurm file used to run the previous pipeline. It's our go to slurm file, using 3 8xH100 nodes. Slurm file #!/bin/bash\n#SBATCH --job-name=math-shepherd-test-ray\n#SBATCH --partition=hopper-prod\n#SBATCH --qos=normal\n#SBATCH --nodes=3\n#SBATCH --exclusive\n#SBATCH --ntasks-per-node=1\n#SBATCH --gpus-per-node=8\n#SBATCH --output=./logs/%x-%j.out\n#SBATCH --err=./logs/%x-%j.err\n#SBATCH --time=48:00:00\n\nset -ex\n\nmodule load cuda/12.1\n\necho \"SLURM_JOB_ID: $SLURM_JOB_ID\"\necho \"SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST\"\n\nsource .venv/bin/activate\n\n# Getting the node names\nnodes=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\")\nnodes_array=($nodes)\n\n# Get the IP address of the head node\nhead_node=${nodes_array[0]}\nhead_node_ip=$(srun --nodes=1 --ntasks=1 -w \"$head_node\" hostname --ip-address)\n\n# Start Ray head node\nport=6379\nip_head=$head_node_ip:$port\nexport ip_head\necho \"IP Head: $ip_head\"\n\n# Generate a unique Ray tmp dir for the head node\nhead_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_head\"\n\necho \"Starting HEAD at $head_node\"\nsrun --nodes=1 --ntasks=1 -w \"$head_node\" \\\n ray start --head --node-ip-address=\"$head_node_ip\" --port=$port \\\n --dashboard-host=0.0.0.0 \\\n --dashboard-port=8265 \\\n --temp-dir=\"$head_tmp_dir\" \\\n --block &\n\n# Give some time to head node to start...\nsleep 10\n\n# Start Ray worker nodes\nworker_num=$((SLURM_JOB_NUM_NODES - 1))\n\n# Start from 1 (0 is head node)\nfor ((i = 1; i <= worker_num; i++)); do\n node_i=${nodes_array[$i]}\n worker_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_worker_$i\"\n echo \"Starting WORKER $i at $node_i\"\n srun --nodes=1 --ntasks=1 -w \"$node_i\" \\\n ray start --address \"$ip_head\" \\\n --temp-dir=\"$worker_tmp_dir\" \\\n --block &\n sleep 5\ndone\n\n# Give some time to the Ray cluster to gather info\nsleep 60\n\n# Finally submit the job to the cluster\nRAY_ADDRESS=\"http://$head_node_ip:8265\" ray job submit --working-dir pipeline -- python -u pipeline_math_shepherd_ray.py\n Final dataset The resulting dataset can be seen at: plaguss/test_math_shepherd_prm_ray. "},{"location":"sections/pipeline_samples/papers/prometheus/","title":"Prometheus 2","text":"\"Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models\" presents Prometheus 2, a new and more powerful evaluator LLM compared to Prometheus (its predecessor) presented in \"Prometheus: Inducing Fine-grained Evaluation Capability in Language Models\"; since GPT-4, as well as other proprietary LLMs, are commonly used to assess the quality of the responses for various LLMs, but there are concerns about transparency, controllability, and affordability, that motivate the need of open-source LLMs specialized in evaluations. Existing open evaluator LMs exhibit critical shortcomings: - They issue scores that significantly diverge from those assigned by humans.
- They lack the flexibility to perform both direct assessment and pairwise ranking, the two most prevalent forms of assessment.
Additionally, they do not possess the ability to evaluate based on custom evaluation criteria, focusing instead on general attributes like helpfulness and harmlessness. Prometheus 2 is capable of processing both direct assessment and pair-wise ranking formats grouped with user-defined evaluation criteria. Prometheus 2 released two variants: prometheus-eval/prometheus-7b-v2.0 : fine-tuned on top of mistralai/Mistral-7B-Instruct-v0.2 prometheus-eval/prometheus-8x7b-v2.0 : fine-tuned on top of mistralai/Mixtral-8x7B-Instruct-v0.1 Both models have been fine-tuned for both direct assessment and pairwise ranking tasks i.e. assessing the quality of a single isolated response for a given instruction with or without a reference answer and assessing the quality of one response against another one for a given instruction with or without a reference answer, respectively. On four direct assessment benchmarks and four pairwise ranking benchmarks, Prometheus 2 scores the highest correlation and agreement with humans and proprietary LM judges among all tested open evaluator LMs. Their models, code, and data are all publicly available at prometheus-eval/prometheus-eval . "},{"location":"sections/pipeline_samples/papers/prometheus/#replication","title":"Replication","text":"Note The section is named Replication but in this case we're not replicating the Prometheus 2 paper per se, but rather showing how to use the PrometheusEval task implemented within distilabel to evaluate the quality of the responses from a given instruction using the Prometheus 2 model. To showcase Prometheus 2 we will be using the PrometheusEval task implemented in distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes. "},{"location":"sections/pipeline_samples/papers/prometheus/#installation","title":"Installation","text":"To reproduce the code below, one will need to install distilabel as it follows: pip install \"distilabel[vllm]>=1.1.0\"\n Alternatively, it's recommended to install Dao-AILab/flash-attention to benefit from Flash Attention 2 speed ups during inference via vllm . pip install flash-attn --no-build-isolation\n Note The installation notes above assume that you are using a VM with one GPU accelerator with at least the required VRAM to fit prometheus-eval/prometheus-7b-v2.0 in bfloat16 (28GB); but if you have enough VRAM to fit their 8x7B model in bfloat16 (~90GB) you can use prometheus-eval/prometheus-8x7b-v2.0 instead. "},{"location":"sections/pipeline_samples/papers/prometheus/#building-blocks","title":"Building blocks","text":" -
LoadDataFromHub : GeneratorStep to load a dataset from the Hugging Face Hub. -
PrometheusEval : Task that assesses the quality of a response for a given instruction using any of the Prometheus 2 models. vLLM : LLM that loads a model from the Hugging Face Hub via vllm-project/vllm. Note Since the Prometheus 2 models use a slightly different chat template than mistralai/Mistral-7B-Instruct-v0.2 , we need to set the chat_template parameter to [INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST] so as to properly format the input for Prometheus 2. -
(Optional) KeepColumns : Task that keeps only the specified columns in the dataset, used to remove the undesired columns. "},{"location":"sections/pipeline_samples/papers/prometheus/#code","title":"Code","text":"As mentioned before, we will put the previously mentioned building blocks together to see how Prometheus 2 can be used via distilabel . from distilabel.models import vLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import KeepColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import PrometheusEval\n\nif __name__ == \"__main__\":\n with Pipeline(name=\"prometheus\") as pipeline:\n load_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n repo_id=\"HuggingFaceH4/instruction-dataset\",\n split=\"test\",\n output_mappings={\"prompt\": \"instruction\", \"completion\": \"generation\"},\n )\n\n task = PrometheusEval(\n name=\"task\",\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"factual-validity\",\n reference=False,\n num_generations=1,\n group_generations=False,\n )\n\n keep_columns = KeepColumns(\n name=\"keep_columns\",\n columns=[\"instruction\", \"generation\", \"feedback\", \"result\", \"model_name\"],\n )\n\n load_dataset >> task >> keep_columns\n Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched. distiset = pipeline.run(\n parameters={\n task.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 1024,\n \"temperature\": 0.7,\n },\n },\n },\n },\n)\n Finally, we can optionally push the generated dataset, named Distiset , to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub. distiset.push_to_hub(\n \"instruction-dataset-prometheus\",\n private=True,\n)\n "},{"location":"sections/pipeline_samples/papers/ultrafeedback/","title":"UltraFeedback","text":"UltraFeedback: Boosting Language Models with High-quality Feedback is a paper published by OpenBMB which proposes UltraFeedback , a large-scale, fine-grained, diverse preference dataset, used for training powerful reward models and critic models. UltraFeedback collects about 64k prompts from diverse resources (including UltraChat, ShareGPT, Evol-Instruct, TruthfulQA, FalseQA, and FLAN), then they use these prompts to query multiple LLMs (commercial models, Llama models ranging 7B to 70B, and non-Llama models) and generate four different responses for each prompt, resulting in a total of 256k samples i.e. the UltraFeedback will rate four responses on every OpenAI request. To collect high-quality preference and textual feedback, they design a fine-grained annotation instruction, which contains four different aspects, namely instruction-following, truthfulness, honesty and helpfulness (even though within the paper they also mention a fifth one named verbalized calibration). Finally, GPT-4 is used to generate the ratings for the generated responses to the given prompt using the previously mentioned aspects. "},{"location":"sections/pipeline_samples/papers/ultrafeedback/#replication","title":"Replication","text":"To replicate the paper we will be using distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes. 
Also for testing purposes we will just show how to evaluate the generated responses for a given prompt using a new global aspect named overall-rating defined by Argilla, that computes the average of the four aspects, so as to reduce number of requests to be sent to OpenAI, but note that all the aspects are implemented within distilabel and can be used instead for a more faithful reproduction. Besides that we will generate three responses for each instruction using three LLMs selected from a pool of six: HuggingFaceH4/zephyr-7b-beta , argilla/notus-7b-v1 , google/gemma-1.1-7b-it , meta-llama/Meta-Llama-3-8B-Instruct , HuggingFaceH4/zephyr-7b-gemma-v0.1 and mlabonne/UltraMerge-7B . "},{"location":"sections/pipeline_samples/papers/ultrafeedback/#installation","title":"Installation","text":"To replicate UltraFeedback one will need to install distilabel as it follows: pip install \"distilabel[argilla,openai,vllm]>=1.0.0\"\n And since we will be using vllm we will need to use a VM with at least 6 NVIDIA GPUs with at least 16GB of memory each to run the text generation, and set the OPENAI_API_KEY environment variable value. "},{"location":"sections/pipeline_samples/papers/ultrafeedback/#building-blocks","title":"Building blocks","text":" LoadDataFromHub : Generator Step to load a dataset from the Hugging Face Hub. sample_n_steps : Function to create a routing_batch_function that samples n downstream steps for each batch generated by the upstream step. This is the key to replicate the LLM pooling mechanism described in the paper. TextGeneration : Task to generate responses for a given instruction using an LLM. vLLM : LLM that loads a model from the Hugging Face Hub using vllm . GroupColumns : Task that combines multiple columns into a single one i.e. from string to list of strings. Useful when there are multiple parallel steps that are connected to the same node. UltraFeedback : Task that generates ratings for the responses of a given instruction using the UltraFeedback prompt. OpenAILLM : LLM that loads a model from OpenAI. KeepColumns : Task to keep the desired columns while removing the not needed ones, as well as defining the order for those. - (optional)
PreferenceToArgilla : Task to optionally push the generated dataset to Argilla to do some further analysis and human annotation. "},{"location":"sections/pipeline_samples/papers/ultrafeedback/#code","title":"Code","text":"As mentioned before, we will put the previously mentioned building blocks together to replicate UltraFeedback. from distilabel.models import OpenAILLM, vLLM\nfrom distilabel.pipeline import Pipeline, sample_n_steps\nfrom distilabel.steps import (\n GroupColumns,\n KeepColumns,\n LoadDataFromHub,\n PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nsample_three_llms = sample_n_steps(n=3)\n\n\nwith Pipeline(name=\"ultrafeedback-pipeline\") as pipeline:\n load_hub_dataset = LoadDataFromHub(\n name=\"load_dataset\",\n output_mappings={\"prompt\": \"instruction\"},\n batch_size=2,\n )\n\n text_generation_with_notus = TextGeneration(\n name=\"text_generation_with_notus\",\n llm=vLLM(model=\"argilla/notus-7b-v1\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n text_generation_with_zephyr = TextGeneration(\n name=\"text_generation_with_zephyr\",\n llm=vLLM(model=\"HuggingFaceH4/zephyr-7b-gemma-v0.1\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n text_generation_with_gemma = TextGeneration(\n name=\"text_generation_with_gemma\",\n llm=vLLM(model=\"google/gemma-1.1-7b-it\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n text_generation_with_zephyr_gemma = TextGeneration(\n name=\"text_generation_with_zephyr_gemma\",\n llm=vLLM(model=\"HuggingFaceH4/zephyr-7b-gemma-v0.1\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n text_generation_with_llama = TextGeneration(\n name=\"text_generation_with_llama\",\n llm=vLLM(model=\"meta-llama/Meta-Llama-3-8B-Instruct\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n text_generation_with_ultramerge = TextGeneration(\n name=\"text_generation_with_ultramerge\",\n llm=vLLM(model=\"mlabonne/UltraMerge-7B\"),\n input_batch_size=2,\n output_mappings={\"model_name\": \"generation_model\"},\n )\n\n combine_columns = GroupColumns(\n name=\"combine_columns\",\n columns=[\"generation\", \"generation_model\"],\n output_columns=[\"generations\", \"generation_models\"],\n input_batch_size=2\n )\n\n ultrafeedback = UltraFeedback(\n name=\"ultrafeedback_openai\",\n llm=OpenAILLM(model=\"gpt-4-turbo-2024-04-09\"),\n aspect=\"overall-rating\",\n output_mappings={\"model_name\": \"ultrafeedback_model\"},\n )\n\n keep_columns = KeepColumns(\n name=\"keep_columns\",\n columns=[\n \"instruction\",\n \"generations\",\n \"generation_models\",\n \"ratings\",\n \"rationales\",\n \"ultrafeedback_model\",\n ],\n )\n\n (\n load_hub_dataset\n >> sample_three_llms\n >> [\n text_generation_with_notus,\n text_generation_with_zephyr,\n text_generation_with_gemma,\n text_generation_with_llama,\n text_generation_with_zephyr_gemma,\n text_generation_with_ultramerge\n ]\n >> combine_columns\n >> ultrafeedback\n >> keep_columns\n )\n\n # Optional: Push the generated dataset to Argilla, but will need to `pip install argilla` first\n # push_to_argilla = PreferenceToArgilla(\n # name=\"push_to_argilla\",\n # api_url=\"<ARGILLA_API_URL>\",\n # api_key=\"<ARGILLA_API_KEY>\", # type: ignore\n # dataset_name=\"ultrafeedback\",\n # dataset_workspace=\"admin\",\n # num_generations=2,\n # )\n # keep_columns >> push_to_argilla\n Note As we're using a 
relative small dataset, we're setting a low batch_size and input_batch_size so we have more batches for the routing_batch_function i.e. we will have more variety on the LLMs used to generate the responses. When using a large dataset, it's recommended to use a larger batch_size and input_batch_size to benefit from the vLLM optimizations for larger batch sizes, which makes the pipeline execution faster. Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched. distiset = pipeline.run(\n parameters={\n load_hub_dataset.name: {\n \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n \"split\": \"test\",\n },\n text_generation_with_notus.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n text_generation_with_zephyr.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n text_generation_with_gemma.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n text_generation_with_llama.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n text_generation_with_zephyr_gemma.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n text_generation_with_ultramerge.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 512,\n \"temperature\": 0.7,\n }\n },\n },\n ultrafeedback.name: {\n \"llm\": {\n \"generation_kwargs\": {\n \"max_new_tokens\": 2048,\n \"temperature\": 0.7,\n }\n },\n },\n }\n)\n Finally, we can optionally push the generated dataset, named Distiset , to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub. distiset.push_to_hub(\n \"ultrafeedback-instruction-dataset\",\n private=True,\n)\n "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/","title":"Synthetic data generation for fine-tuning custom retrieval and reranking models","text":"!pip install \"distilabel[hf-inference-endpoints]\"\n !pip install \"sentence-transformers~=3.0\"\n Let's make the needed imports: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.steps import LoadDataFromHub\n\nfrom sentence_transformers import SentenceTransformer, CrossEncoder\nimport torch\n You'll need an HF_TOKEN to use the HF Inference Endpoints. Login to use it directly within this notebook. 
import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n !pip install \"distilabel[argilla, hf-inference-endpoints]\"\n Let's make the extra needed imports: import argilla as rg\n context = (\n\"\"\"\nThe text is a chunk from technical Python SDK documentation of Argilla.\nArgilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets.\nAlong with prose explanations, the text chunk may include code snippets and Python references.\n\"\"\"\n)\n llm = InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n tokenizer_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n)\n\nwith Pipeline(name=\"generate\") as pipeline:\n load_dataset = LoadDataFromHub(\n num_examples=15,\n output_mappings={\"chunks\": \"anchor\"},\n )\n generate_retrieval_pairs = GenerateSentencePair(\n name=\"generate_retrieval_pairs\",\n triplet=True,\n hard_negative=True,\n action=\"query\",\n llm=llm,\n input_batch_size=10,\n context=context,\n )\n generate_reranking_pairs = GenerateSentencePair(\n name=\"generate_reranking_pairs\",\n triplet=True,\n hard_negative=False, # to potentially generate non-relevant pairs\n action=\"semantically-similar\",\n llm=llm,\n input_batch_size=10,\n context=context,\n )\n\n load_dataset.connect(generate_retrieval_pairs, generate_reranking_pairs)\n Next, we can execute this using pipeline.run . We will provide some parameters to specific components within our pipeline. generation_kwargs = {\n \"llm\": {\n \"generation_kwargs\": {\n \"temperature\": 0.7,\n \"max_new_tokens\": 512,\n }\n }\n}\n\ndistiset = pipeline.run( \n parameters={\n load_dataset.name: {\n \"repo_id\": \"plaguss/argilla_sdk_docs_raw_unstructured\",\n \"split\": \"train\",\n },\n generate_retrieval_pairs.name: generation_kwargs,\n generate_reranking_pairs.name: generation_kwargs,\n },\n use_cache=False, # False for demo\n)\n Data generation can be a expensive, so it is recommended to store the data somewhere. For now, we will store it on the Hugging Face Hub, using our push_to_hub method. distiset.push_to_hub(\"[your-owner-name]/example-retrieval-reranking-dataset\")\n We have got 2 different leaf/end nodes, therefore we've got a distil configurations we can access, one for the retrieval data, and one for the reranking data. Looking at these initial examples, we can see they nicely capture the essence of the chunks column but we will need to evaluate the quality of the data a bit more before we can use it for fine-tuning. model_id = \"Snowflake/snowflake-arctic-embed-m\" # Hugging Face model ID\n\nmodel_retrieval = SentenceTransformer(\n model_id, device=\"cuda\" if torch.cuda.is_available() else \"cpu\"\n)\n Next, we will encode the generated text pairs and compute the similarities. 
from sklearn.metrics.pairwise import cosine_similarity\n\ndef get_embeddings(texts):\n vectors = model_retrieval.encode(texts)\n return [vector.tolist() for vector in vectors]\n\n\ndef get_similarities(vector_batch_a, vector_batch_b):\n similarities = []\n for vector_a, vector_b in zip(vector_batch_a, vector_batch_b):\n similarity = cosine_similarity([vector_a], [vector_b])[0][0]\n similarities.append(similarity)\n return similarities\n\ndef format_data_retriever(batch):# -> Any:\n batch[\"anchor-vector\"] = get_embeddings(batch[\"anchor\"])\n batch[\"positive-vector\"] = get_embeddings(batch[\"positive\"])\n batch[\"negative-vector\"] = get_embeddings(batch[\"negative\"]) \n batch[\"similarity-positive-negative\"] = get_similarities(batch[\"positive-vector\"], batch[\"negative-vector\"])\n batch[\"similarity-anchor-positive\"] = get_similarities(batch[\"anchor-vector\"], batch[\"positive-vector\"])\n batch[\"similarity-anchor-negative\"] = get_similarities(batch[\"anchor-vector\"], batch[\"negative-vector\"])\n return batch\n\ndataset_generate_retrieval_pairs = distiset[\"generate_retrieval_pairs\"][\"train\"].map(format_data_retriever, batched=True, batch_size=250)\n model_id = \"sentence-transformers/all-MiniLM-L12-v2\"\n\nmodel = CrossEncoder(model_id)\n Next, we will compute the similarity for the generated text pairs using the reranker. On top of that, we will compute an anchor-vector to allow for doing semantic search. def format_data_retriever(batch):# -> Any:\n batch[\"anchor-vector\"] = get_embeddings(batch[\"anchor\"])\n batch[\"similarity-positive-negative\"] = model.predict(zip(batch[\"positive-vector\"], batch[\"negative-vector\"]))\n batch[\"similarity-anchor-positive\"] = model.predict(zip(batch[\"anchor-vector\"], batch[\"positive-vector\"]))\n batch[\"similarity-anchor-negative\"] = model.predict(zip(batch[\"anchor-vector\"], batch[\"negative-vector\"]))\n return batch\n\ndataset_generate_reranking_pairs = distiset[\"generate_reranking_pairs\"][\"train\"].map(format_data_retriever, batched=True, batch_size=250)\n And voila, we have our proxies for quality evaluation which we can use to filter out the best and worst examples. First, we need to define the setting for our Argilla dataset. We will create two different datasets, one for the retrieval data and one for the reranking data to ensure our annotators can focus on the task at hand. 
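Before moving on to the Argilla setup, here is a minimal, illustrative sketch (not part of the original tutorial) of how these similarity columns could be used to pre-filter the retrieval triplets; the 0.05 margin is an arbitrary assumption and should be tuned for your own data: margin = 0.05 # arbitrary threshold, not a value from this tutorial\n\n# keep only triplets whose positive is closer to the anchor than the hard negative\nfiltered_retrieval_pairs = dataset_generate_retrieval_pairs.filter(\n lambda row: row[\"similarity-anchor-positive\"] >= row[\"similarity-anchor-negative\"] + margin\n)\nprint(len(dataset_generate_retrieval_pairs), \"->\", len(filtered_retrieval_pairs))\n The Argilla dataset settings defined next then let annotators review the pairs that remain.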
import argilla as rg\nfrom argilla._exceptions import ConflictError\n\napi_key = \"ohh so secret\"\napi_url = \"https://[your-owner-name]-[your-space-name].hf.space\"\n\nclient = rg.Argilla(api_url=api_url, api_key=api_key)\n\nsettings = rg.Settings(\n fields=[\n rg.TextField(\"anchor\")\n ],\n questions=[\n rg.TextQuestion(\"positive\"),\n rg.TextQuestion(\"negative\"),\n rg.LabelQuestion(\n name=\"is_positive_relevant\",\n title=\"Is the positive query relevant?\",\n labels=[\"yes\", \"no\"],\n ),\n rg.LabelQuestion(\n name=\"is_negative_irrelevant\",\n title=\"Is the negative query irrelevant?\",\n labels=[\"yes\", \"no\"],\n )\n ],\n metadata=[\n rg.TermsMetadataProperty(\"filename\"),\n rg.FloatMetadataProperty(\"similarity-positive-negative\"),\n rg.FloatMetadataProperty(\"similarity-anchor-positive\"),\n rg.FloatMetadataProperty(\"similarity-anchor-negative\"),\n ],\n vectors=[\n rg.VectorField(\"anchor-vector\", dimensions=model.get_sentence_embedding_dimension())\n ]\n)\nrg_datasets = []\nfor dataset_name in [\"generate_retrieval_pairs\", \"generate_reranking_pairs\"]:\n ds = rg.Dataset(\n name=dataset_name,\n settings=settings\n )\n try:\n ds.create()\n except ConflictError:\n ds = client.datasets(dataset_name)\n rg_datasets.append(ds)\n Now, we've got our dataset definitions setup in Argilla, we can upload our data to Argilla. ds_datasets = [dataset_generate_retrieval_pairs, dataset_generate_reranking_pairs]\n\nrecords = []\n\nfor rg_dataset, ds_dataset in zip(rg_datasets, ds_datasets):\n for idx, entry in enumerate(ds_dataset):\n records.append(\n rg.Record(\n id=idx,\n fields={\"anchor\": entry[\"anchor\"]},\n suggestions=[\n rg.Suggestion(\"positive\", value=entry[\"positive\"], agent=\"gpt-4o\", type=\"model\"),\n rg.Suggestion(\"negative\", value=entry[\"negative\"], agent=\"gpt-4o\", type=\"model\"),\n ],\n metadata={\n \"filename\": entry[\"filename\"],\n \"similarity-positive-negative\": entry[\"similarity-positive-negative\"],\n \"similarity-anchor-positive\": entry[\"similarity-anchor-positive\"],\n \"similarity-anchor-negative\": entry[\"similarity-anchor-negative\"]\n },\n vectors={\"anchor-vector\": entry[\"anchor-vector\"]}\n )\n )\n rg_dataset.records.log(records)\n Now, we can explore the UI and add a final human touch to get he most out of our dataset. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#synthetic-data-generation-for-fine-tuning-custom-retrieval-and-reranking-models","title":"Synthetic data generation for fine-tuning custom retrieval and reranking models","text":" - Goal: Bootstrap, optimize and maintain your embedding models and rerankers through synthetic data generation and human feedback.
- Libraries: argilla, hf-inference-endpoints, sentence-transformers
- Components: LoadDataFromHub, GenerateSentencePair, InferenceEndpointsLLM
Note For a comprehensive overview on optimizing the retrieval performance in a RAG pipeline, check this guide in collaboration with ZenML, an open-source MLOps framework designed for building portable and production-ready machine learning pipelines. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#install-the-dependencies","title":"Install the dependencies","text":"To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command: "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide. Along with that, you will need to install Argilla as a distilabel extra. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#the-dataset","title":"The dataset","text":"Before starting any project, it is always important to look at your data. Our data is publicly available on the Hugging Face Hub so we can have a quick look through their dataset viewer within an embedded iFrame. As we can see, our dataset contains a column called chunks , which was obtained from the Argilla docs. Normally, you would need to download and chunk the data but we will not cover that in this tutorial. To read a full explanation for how this dataset was generated, please refer to How we leveraged distilabel to create an Argilla 2.0 Chatbot. Alternatively, we can load the entire dataset to disk with datasets.load_dataset . "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#synthetic-data-generation","title":"Synthetic data generation","text":"The GenerateSentencePair component from distilabel can be used to generate training datasets for embeddings models. It is a pre-defined Task that given an anchor sentence generate data for a specific action . Supported actions are: \"paraphrase\", \"semantically-similar\", \"query\", \"answer\" . In our case the chunks column corresponds to the anchor . This means we will use query to generate potential queries for a fine-tuning a retrieval model and that we will use semantically-similar to generate texts that are similar to the intial anchor for fine-tuning a reranking model. We will triplet=True in order to generate both positive and negative examples, which should help the model generalize better during fine-tuning and we will set hard_negative=True to generate more challenging examples that are closer to the anchor and discussed topics. Lastly, we can seed the LLM with context to generate more relevant examples. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval","title":"Retrieval","text":"For retrieval, we will thus generate queries that are similar to the chunks column. We will use the query action to generate potential queries for a fine-tuning a retrieval model. 
generate_sentence_pair = GenerateSentencePair(\n triplet=True, \n hard_negative=True,\n action=\"query\",\n llm=llm,\n input_batch_size=10,\n context=context,\n)\n "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking","title":"Reranking","text":"For reranking, we will generate texts that are similar to the intial anchor. We will use the semantically-similar action to generate texts that are similar to the intial anchor for fine-tuning a reranking model. In this case, we set hard_negative=False to generate more diverse and potentially wrong examples, which can be used as negative examples for similarity fine-tuning because rerankers cannot be fine-tuned using triplets. generate_sentence_pair = GenerateSentencePair(\n triplet=True,\n hard_negative=False,\n action=\"semantically-similar\",\n llm=llm,\n input_batch_size=10,\n context=context,\n)\n "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#combined-pipeline","title":"Combined pipeline","text":"We will now use the GenerateSentencePair task to generate synthetic data for both retrieval and reranking models in a single pipeline. Note that, we map the chunks column to the anchor argument. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#data-quality-evaluation","title":"Data quality evaluation","text":"Data is never as clean as it can be and this also holds for synthetically generated data too, therefore, it is always good to spent some time and look at your data. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#feature-engineering","title":"Feature engineering","text":"In order to evaluate the quality of our data we will use features of the models that we intent to fine-tune as proxy for data quality. We can then use these features to filter out the best examples. In order to choose a good default model, we will use the Massive Text Embedding Benchmark (MTEB) Leaderboard. We want to optimize for size and speed, so we will set model size <100M and then filter for Retrieval and Reranking based on the highest average score, resulting in Snowflake/snowflake-arctic-embed-s and sentence-transformers/all-MiniLM-L12-v2 respectively. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval_1","title":"Retrieval","text":"For retrieval, we will compute similarities for the current embeddings of anchor-positive , positive-negative and anchor-negative pairs. We assume that an overlap of these similarities will cause the model to have difficulties generalizing and therefore we can use these features to evaluate the quality of our data. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking_1","title":"Reranking","text":"For reranking, we will compute the compute the relevance scores from an existing reranker model for anchor-positive , positive-negative and anchor-negative pais and make a similar assumption as for the retrieval model. "},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#optional-argilla","title":"(Optional) Argilla","text":"To get the most out of you data and actually look at our data, we will use Argilla. If you are not familiar with Argilla, we recommend taking a look at the Argilla quickstart docs. Alternatively, you can use your Hugging Face account to login to the Argilla demo Space. To start exploring data, we first need to define an argilla.Dataset . 
We will create a basic dataset with some input TextFields for the anchor and output TextQuestions for the positive and negative pairs. Additionally, we will use the file_name as MetaDataProperty . Lastly, we will be re-using the vectors obtained from our previous step to allow for semantic search and we will add the similarity scores for some basic filtering and sorting. \"},{\"location\":\"sections/pipeline_samples/tutorials/GenerateSentencePair/#fine-tuning\",\"title\":\"Fine-tuning\",\"text\":\"At last, we can fine-tune our models. We will use the sentence-transformers library to fine-tune our models. \"},{\"location\":\"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval_2\",\"title\":\"Retrieval\",\"text\":\"For retrieval, we have created a script that fine-tunes a model on our generated data, based on https://github.com/argilla-io/argilla-sdk-chatbot/blob/main/train_embedding.ipynb. You can also open it in Google Colab directly. \"},{\"location\":\"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking_2\",\"title\":\"Reranking\",\"text\":\"For reranking, sentence-transformers provides a script that shows how to fine-tune CrossEncoder models. As of now, there is some uncertainty over fine-tuning CrossEncoder models with triplets, but you can still use the positive and anchor pairs. \"},{\"location\":\"sections/pipeline_samples/tutorials/GenerateSentencePair/#conclusions\",\"title\":\"Conclusions\",\"text\":\"In this tutorial, we present an end-to-end example of fine-tuning retrievers and rerankers for RAG. This serves as a good starting point for optimizing and maintaining your data and model, but it needs to be adapted to your specific use case. We started with some seed data from the Argilla docs, generated synthetic data for retrieval and reranking models, evaluated the quality of the data, and showed how to fine-tune the models. We also used Argilla to get a human touch on the data. \"},{\"location\":\"sections/pipeline_samples/tutorials/clean_existing_dataset/\",\"title\":\"Clean an existing preference dataset\",\"text\":\" - Goal: Clean an existing preference dataset by providing AI feedback on the quality of the data.
- Libraries: argilla, hf-inference-endpoints
- Components: LoadDataFromDicts, UltraFeedback, KeepColumns, PreferenceToArgilla, InferenceEndpointsLLM, GlobalStep
!pip install \"distilabel[hf-inference-endpoints]\"\n !pip install \"transformers~=4.0\" \"torch~=2.0\"\n Let's make the required imports: import random\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import (\n KeepColumns,\n LoadDataFromDicts,\n PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import UltraFeedback\n You'll need an HF_TOKEN to use the HF Inference Endpoints. Login to use it directly within this notebook. import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n !pip install \"distilabel[argilla, hf-inference-endpoints]\"\n In this case, we will clean a preference dataset, so we will use the Intel/orca_dpo_pairs dataset from the Hugging Face Hub. dataset = load_dataset(\"Intel/orca_dpo_pairs\", split=\"train[:20]\")\n Next, we will shuffle the chosen and rejected columns to avoid any bias in the dataset. def shuffle_and_track(chosen, rejected):\n pair = [chosen, rejected]\n random.shuffle(pair)\n order = [\"chosen\" if x == chosen else \"rejected\" for x in pair]\n return {\"generations\": pair, \"order\": order}\n\ndataset = dataset.map(lambda x: shuffle_and_track(x[\"chosen\"], x[\"rejected\"]))\n dataset = dataset.to_list()\n As a custom step You can also create a custom step in a separate module, import it and add it to the pipeline after loading the orca_dpo_pairs dataset using the LoadDataFromHub step. shuffle_step.pyfrom typing import TYPE_CHECKING, List\nfrom distilabel.steps import GlobalStep, StepInput\n\nif TYPE_CHECKING:\n from distilabel.steps.typing import StepOutput\n\nimport random\n\nclass ShuffleStep(GlobalStep):\n @property\n def inputs(self):\n \"\"\"Returns List[str]: The inputs of the step.\"\"\"\n return [\"instruction\", \"chosen\", \"rejected\"]\n\n @property\n def outputs(self):\n \"\"\"Returns List[str]: The outputs of the step.\"\"\"\n return [\"instruction\", \"generations\", \"order\"]\n\n def process(self, inputs: StepInput):\n \"\"\"Returns StepOutput: The outputs of the step.\"\"\"\n outputs = []\n\n for input in inputs:\n chosen = input[\"chosen\"]\n rejected = input[\"rejected\"]\n pair = [chosen, rejected]\n random.shuffle(pair)\n order = [\"chosen\" if x == chosen else \"rejected\" for x in pair]\n\n outputs.append({\"instruction\": input[\"instruction\"], \"generations\": pair, \"order\": order})\n\n yield outputs\n from shuffle_step import ShuffleStep\n To clean an existing preference dataset, we will need to define a Pipeline with all the necessary steps. However, a similar workflow can be used to clean a SFT dataset. Below, we will go over each step in detail. load_dataset = LoadDataFromDicts(\n data=dataset[:1],\n output_mappings={\"question\": \"instruction\"},\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nload_dataset.load()\nnext(load_dataset.process())\n \n([{'system': '',\n 'question': \"You will be given a definition of a task first, then some input of the task.\\nThis task is about using the specified sentence and converting the sentence to Resource Description Framework (RDF) triplets of the form (subject, predicate object). The RDF triplets generated must be such that the triplets accurately capture the structure and semantics of the input sentence. The input is a sentence and the output is a list of triplets of the form [subject, predicate, object] that capture the relationships present in the sentence. 
When a sentence has more than 1 RDF triplet possible, the output must contain all of them.\\n\\nAFC Ajax (amateurs)'s ground is Sportpark De Toekomst where Ajax Youth Academy also play.\\nOutput:\",\n 'chosen': '[\\n [\"AFC Ajax (amateurs)\", \"has ground\", \"Sportpark De Toekomst\"],\\n [\"Ajax Youth Academy\", \"plays at\", \"Sportpark De Toekomst\"]\\n]',\n 'rejected': \" Sure, I'd be happy to help! Here are the RDF triplets for the input sentence:\\n\\n[AFC Ajax (amateurs), hasGround, Sportpark De Toekomst]\\n[Ajax Youth Academy, playsAt, Sportpark De Toekomst]\\n\\nExplanation:\\n\\n* AFC Ajax (amateurs) is the subject of the first triplet, and hasGround is the predicate that describes the relationship between AFC Ajax (amateurs) and Sportpark De Toekomst.\\n* Ajax Youth Academy is the subject of the second triplet, and playsAt is the predicate that describes the relationship between Ajax Youth Academy and Sportpark De Toekomst.\\n\\nNote that there may be other possible RDF triplets that could be derived from the input sentence, but the above triplets capture the main relationships present in the sentence.\",\n 'generations': [\" Sure, I'd be happy to help! Here are the RDF triplets for the input sentence:\\n\\n[AFC Ajax (amateurs), hasGround, Sportpark De Toekomst]\\n[Ajax Youth Academy, playsAt, Sportpark De Toekomst]\\n\\nExplanation:\\n\\n* AFC Ajax (amateurs) is the subject of the first triplet, and hasGround is the predicate that describes the relationship between AFC Ajax (amateurs) and Sportpark De Toekomst.\\n* Ajax Youth Academy is the subject of the second triplet, and playsAt is the predicate that describes the relationship between Ajax Youth Academy and Sportpark De Toekomst.\\n\\nNote that there may be other possible RDF triplets that could be derived from the input sentence, but the above triplets capture the main relationships present in the sentence.\",\n '[\\n [\"AFC Ajax (amateurs)\", \"has ground\", \"Sportpark De Toekomst\"],\\n [\"Ajax Youth Academy\", \"plays at\", \"Sportpark De Toekomst\"]\\n]'],\n 'order': ['rejected', 'chosen']}],\n True) \n evaluate_responses = UltraFeedback(\n aspect=\"overall-rating\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nevaluate_responses.load()\nnext(\n evaluate_responses.process(\n [\n {\n \"instruction\": \"What's the capital of Spain?\",\n \"generations\": [\"Madrid\", \"Barcelona\"],\n }\n ]\n )\n)\n \n[{'instruction': \"What's the capital of Spain?\",\n 'generations': ['Madrid', 'Barcelona'],\n 'ratings': [5, 1],\n 'rationales': [\"The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\",\n \"The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"],\n 'distilabel_metadata': {'raw_output_ultra_feedback_0': \"#### Output for Text 1\\nRating: 5 (Excellent)\\nRationale: The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. 
It confidently provides the accurate information, aligning perfectly with the user's intent.\\n\\n#### Output for Text 2\\nRating: 1 (Low Quality)\\nRationale: The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"},\n 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] \n keep_columns = KeepColumns(\n columns=[\n \"instruction\",\n \"generations\",\n \"order\",\n \"ratings\",\n \"rationales\",\n \"model_name\",\n ],\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nkeep_columns.load()\nnext(\n keep_columns.process(\n [\n {\n \"system\": \"\",\n \"instruction\": \"What's the capital of Spain?\",\n \"chosen\": \"Madrid\",\n \"rejected\": \"Barcelona\",\n \"generations\": [\"Madrid\", \"Barcelona\"],\n \"order\": [\"chosen\", \"rejected\"],\n \"ratings\": [5, 1],\n \"rationales\": [\"\", \"\"],\n \"model_name\": \"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n }\n ]\n )\n)\n \n[{'instruction': \"What's the capital of Spain?\",\n 'generations': ['Madrid', 'Barcelona'],\n 'order': ['chosen', 'rejected'],\n 'ratings': [5, 1],\n 'rationales': ['', ''],\n 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] \n to_argilla = PreferenceToArgilla(\n dataset_name=\"cleaned-dataset\",\n dataset_workspace=\"argilla\",\n api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n api_key=\"[your-api-key]\",\n num_generations=2\n)\n Below, you can see the full pipeline definition: with Pipeline(name=\"clean-dataset\") as pipeline:\n\n load_dataset = LoadDataFromDicts(\n data=dataset, output_mappings={\"question\": \"instruction\"}\n )\n\n evaluate_responses = UltraFeedback(\n aspect=\"overall-rating\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n )\n\n keep_columns = KeepColumns(\n columns=[\n \"instruction\",\n \"generations\",\n \"order\",\n \"ratings\",\n \"rationales\",\n \"model_name\",\n ]\n )\n\n to_argilla = PreferenceToArgilla(\n dataset_name=\"cleaned-dataset\",\n dataset_workspace=\"argilla\",\n api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n api_key=\"[your-api-key]\",\n num_generations=2,\n )\n\n load_dataset.connect(evaluate_responses)\n evaluate_responses.connect(keep_columns)\n keep_columns.connect(to_argilla)\n Let's now run the pipeline and clean our preference dataset. distiset = pipeline.run()\n Let's check it! If you have loaded the data to Argilla, you can start annotating in the Argilla UI. You can push the dataset to the Hub for sharing with the community and embed it to explore the data. distiset.push_to_hub(\"[your-owner-name]/example-cleaned-preference-dataset\")\n In this tutorial, we showcased the detailed steps to build a pipeline for cleaning a preference dataset using distilabel. However, you can customize this pipeline for your own use cases, such as cleaning an SFT dataset or adding custom steps. We used a preference dataset as our starting point and shuffled the data to avoid any bias. Next, we evaluated the responses using a model through the serverless Hugging Face Inference API, following the UltraFeedback standards. Finally, we kept the needed columns and used Argilla for further curation. 
"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#clean-an-existing-preference-dataset","title":"Clean an existing preference dataset","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#getting-started","title":"Getting Started","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#install-the-dependencies","title":"Install the dependencies","text":"To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command: "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide. Along with that, you will need to install Argilla as a distilabel extra. "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#the-dataset","title":"The dataset","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#define-the-pipeline","title":"Define the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#load-the-dataset","title":"Load the dataset","text":"We will use the dataset we just shuffled as source data. - Component:
LoadDataFromDicts - Input columns:
system , question , chosen , rejected , generations and order , the same keys as in the loaded list of dictionaries. - Output columns:
system , instruction , chosen , rejected , generations and order . We will use output_mappings to rename the columns. "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#evaluate-the-responses","title":"Evaluate the responses","text":"To evaluate the quality of the responses, we will use meta-llama/Meta-Llama-3.1-70B-Instruct , applying the UltraFeedback task that judges the responses according to different dimensions (helpfulness, honesty, instruction-following, truthfulness). For an SFT dataset, you can use PrometheusEval instead. - Component:
UltraFeedback task with LLMs using InferenceEndpointsLLM - Input columns:
instruction , generations - Output columns:
ratings , rationales , distilabel_metadata , model_name For your use case and to improve the results, you can use any other LLM of your choice. "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#keep-only-the-required-columns","title":"Keep only the required columns","text":"We will get rid of the unneeded columns. - Component:
KeepColumns - Input columns:
system , instruction , chosen , rejected , generations , ratings , rationales , distilabel_metadata and model_name - Output columns:
instruction , chosen , rejected , generations and order "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#optional-further-data-curation","title":"(Optional) Further data curation","text":"You can use Argilla to further curate your data. - Component:
PreferenceToArgilla step - Input columns:
instruction , generations , generation_models , ratings - Output columns:
instruction , generations , generation_models , ratings "},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#conclusions","title":"Conclusions","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/","title":"Generate a preference dataset","text":" - Goal: Generate a synthetic preference dataset for DPO/ORPO.
- Libraries: argilla, hf-inference-endpoints
- Components: LoadDataFromHub, TextGeneration, UltraFeedback, GroupColumns, FormatTextGenerationDPO, PreferenceToArgilla, InferenceEndpointsLLM
!pip install \"distilabel[hf-inference-endpoints]\"\n !pip install \"transformers~=4.0\" \"torch~=2.0\"\n Let's make the required imports: from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import (\n LoadDataFromHub,\n GroupColumns,\n FormatTextGenerationDPO,\n PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n You'll need an HF_TOKEN to use the HF Inference Endpoints. Log in to use it directly within this notebook. import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n !pip install \"distilabel[argilla, hf-inference-endpoints]\"\n To generate our preference dataset, we will need to define a Pipeline with all the necessary steps. Below, we will go over each step in detail. load_dataset = LoadDataFromHub(\n repo_id= \"argilla/10Kprompts-mini\",\n num_examples=1,\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n )\nload_dataset.load()\nnext(load_dataset.process())\n \n([{'instruction': 'How can I create an efficient and robust workflow that utilizes advanced automation techniques to extract targeted data, including customer information, from diverse PDF documents and effortlessly integrate it into a designated Google Sheet? Furthermore, I am interested in establishing a comprehensive and seamless system that promptly activates an SMS notification on my mobile device whenever a new PDF document is uploaded to the Google Sheet, ensuring real-time updates and enhanced accessibility.',\n 'topic': 'Software Development'}],\n True) \n generate_responses = [\n TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n ),\n TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n tokenizer_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n ),\n]\nfor task in generate_responses:\n task.load()\n print(next(task.process([{\"instruction\": \"Which are the top cities in Spain?\"}])))\n \n[{'instruction': 'Which are the top cities in Spain?', 'generation': 'Spain is a country with a rich culture, history, and architecture, and it has many great cities to visit. Here are some of the top cities in Spain:\\n\\n1. **Madrid**: The capital city of Spain, known for its vibrant nightlife, museums, and historic landmarks like the Royal Palace and Prado Museum.\\n2. **Barcelona**: The second-largest city in Spain, famous for its modernist architecture, beaches, and iconic landmarks like La Sagrada Fam\u00edlia and Park G\u00fcell, designed by Antoni Gaud\u00ed.\\n3. **Valencia**: Located on the Mediterranean coast, Valencia is known for its beautiful beaches, City of Arts and Sciences, and delicious local cuisine, such as paella.\\n4. **Seville**: The capital of Andalusia, Seville is famous for its stunning cathedral, Royal Alc\u00e1zar Palace, and lively flamenco music scene.\\n5. **M\u00e1laga**: A coastal city in southern Spain, M\u00e1laga is known for its rich history, beautiful beaches, and being the birthplace of Pablo Picasso.\\n6. 
**Zaragoza**: Located in the northeastern region of Aragon, Zaragoza is a city with a rich history, known for its Roman ruins, Gothic cathedral, and beautiful parks.\\n7. **Granada**: A city in the Andalusian region, Granada is famous for its stunning Alhambra palace and generalife gardens, a UNESCO World Heritage Site.\\n8. **Bilbao**: A city in the Basque Country, Bilbao is known for its modern architecture, including the Guggenheim Museum, and its rich cultural heritage.\\n9. **Alicante**: A coastal city in the Valencia region, Alicante is famous for its beautiful beaches, historic castle, and lively nightlife.\\n10. **San Sebasti\u00e1n**: A city in the Basque Country, San Sebasti\u00e1n is known for its stunning beaches, gastronomic scene, and cultural events like the San Sebasti\u00e1n International Film Festival.\\n\\nThese are just a few of the many great cities in Spain, each with its own unique character and attractions.', 'distilabel_metadata': {'raw_output_text_generation_0': 'Spain is a country with a rich culture, history, and architecture, and it has many great cities to visit. Here are some of the top cities in Spain:\\n\\n1. **Madrid**: The capital city of Spain, known for its vibrant nightlife, museums, and historic landmarks like the Royal Palace and Prado Museum.\\n2. **Barcelona**: The second-largest city in Spain, famous for its modernist architecture, beaches, and iconic landmarks like La Sagrada Fam\u00edlia and Park G\u00fcell, designed by Antoni Gaud\u00ed.\\n3. **Valencia**: Located on the Mediterranean coast, Valencia is known for its beautiful beaches, City of Arts and Sciences, and delicious local cuisine, such as paella.\\n4. **Seville**: The capital of Andalusia, Seville is famous for its stunning cathedral, Royal Alc\u00e1zar Palace, and lively flamenco music scene.\\n5. **M\u00e1laga**: A coastal city in southern Spain, M\u00e1laga is known for its rich history, beautiful beaches, and being the birthplace of Pablo Picasso.\\n6. **Zaragoza**: Located in the northeastern region of Aragon, Zaragoza is a city with a rich history, known for its Roman ruins, Gothic cathedral, and beautiful parks.\\n7. **Granada**: A city in the Andalusian region, Granada is famous for its stunning Alhambra palace and generalife gardens, a UNESCO World Heritage Site.\\n8. **Bilbao**: A city in the Basque Country, Bilbao is known for its modern architecture, including the Guggenheim Museum, and its rich cultural heritage.\\n9. **Alicante**: A coastal city in the Valencia region, Alicante is famous for its beautiful beaches, historic castle, and lively nightlife.\\n10. **San Sebasti\u00e1n**: A city in the Basque Country, San Sebasti\u00e1n is known for its stunning beaches, gastronomic scene, and cultural events like the San Sebasti\u00e1n International Film Festival.\\n\\nThese are just a few of the many great cities in Spain, each with its own unique character and attractions.'}, 'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct'}]\n[{'instruction': 'Which are the top cities in Spain?', 'generation': ' Here are some of the top cities in Spain based on various factors such as tourism, culture, history, and quality of life:\\n\\n1. Madrid: The capital and largest city in Spain, Madrid is known for its vibrant nightlife, world-class museums (such as the Prado Museum and Reina Sofia Museum), stunning parks (such as the Retiro Park), and delicious food.\\n\\n2. 
Barcelona: Famous for its unique architecture, Barcelona is home to several UNESCO World Heritage sites designed by Antoni Gaud\u00ed, including the Sagrada Familia and Park G\u00fcell. The city also boasts beautiful beaches, a lively arts scene, and delicious Catalan cuisine.\\n\\n3. Valencia: A coastal city located in the east of Spain, Valencia is known for its City of Arts and Sciences, a modern architectural complex that includes a planetarium, opera house, and museum of interactive science. The city is also famous for its paella, a traditional Spanish dish made with rice, vegetables, and seafood.\\n\\n4. Seville: The capital of Andalusia, Seville is famous for its flamenco dancing, stunning cathedral (the largest Gothic cathedral in the world), and the Alc\u00e1zar, a beautiful palace made up of a series of rooms and courtyards.\\n\\n5. Granada: Located in the foothills of the Sierra Nevada mountains, Granada is known for its stunning Alhambra palace, a Moorish fortress that dates back to the 9th century. The city is also famous for its tapas, a traditional Spanish dish that is often served for free with drinks.\\n\\n6. Bilbao: A city in the Basque Country, Bilbao is famous for its modern architecture, including the Guggenheim Museum, a contemporary art museum designed by Frank Gehry. The city is also known for its pintxos, a type of Basque tapas that are served in bars and restaurants.\\n\\n7. M\u00e1laga: A coastal city in Andalusia, M\u00e1laga is known for its beautiful beaches, historic sites (including the Alcazaba and Gibralfaro castles), and the Picasso Museum, which is dedicated to the famous Spanish artist who was born in the city.\\n\\nThese are just a few of the many wonderful cities in Spain.', 'distilabel_metadata': {'raw_output_text_generation_0': ' Here are some of the top cities in Spain based on various factors such as tourism, culture, history, and quality of life:\\n\\n1. Madrid: The capital and largest city in Spain, Madrid is known for its vibrant nightlife, world-class museums (such as the Prado Museum and Reina Sofia Museum), stunning parks (such as the Retiro Park), and delicious food.\\n\\n2. Barcelona: Famous for its unique architecture, Barcelona is home to several UNESCO World Heritage sites designed by Antoni Gaud\u00ed, including the Sagrada Familia and Park G\u00fcell. The city also boasts beautiful beaches, a lively arts scene, and delicious Catalan cuisine.\\n\\n3. Valencia: A coastal city located in the east of Spain, Valencia is known for its City of Arts and Sciences, a modern architectural complex that includes a planetarium, opera house, and museum of interactive science. The city is also famous for its paella, a traditional Spanish dish made with rice, vegetables, and seafood.\\n\\n4. Seville: The capital of Andalusia, Seville is famous for its flamenco dancing, stunning cathedral (the largest Gothic cathedral in the world), and the Alc\u00e1zar, a beautiful palace made up of a series of rooms and courtyards.\\n\\n5. Granada: Located in the foothills of the Sierra Nevada mountains, Granada is known for its stunning Alhambra palace, a Moorish fortress that dates back to the 9th century. The city is also famous for its tapas, a traditional Spanish dish that is often served for free with drinks.\\n\\n6. Bilbao: A city in the Basque Country, Bilbao is famous for its modern architecture, including the Guggenheim Museum, a contemporary art museum designed by Frank Gehry. 
The city is also known for its pintxos, a type of Basque tapas that are served in bars and restaurants.\\n\\n7. M\u00e1laga: A coastal city in Andalusia, M\u00e1laga is known for its beautiful beaches, historic sites (including the Alcazaba and Gibralfaro castles), and the Picasso Museum, which is dedicated to the famous Spanish artist who was born in the city.\\n\\nThese are just a few of the many wonderful cities in Spain.'}, 'model_name': 'mistralai/Mixtral-8x7B-Instruct-v0.1'}]\n \n group_responses = GroupColumns(\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nnext(\n group_responses.process(\n [\n {\n \"generation\": \"Madrid\",\n \"model_name\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n },\n ],\n [\n {\n \"generation\": \"Barcelona\",\n \"model_name\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n }\n ],\n )\n)\n \n[{'generations': ['Madrid', 'Barcelona'],\n 'model_names': ['meta-llama/Meta-Llama-3-8B-Instruct',\n 'mistralai/Mixtral-8x7B-Instruct-v0.1']}] \n evaluate_responses = UltraFeedback(\n aspect=\"overall-rating\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nevaluate_responses.load()\nnext(\n evaluate_responses.process(\n [\n {\n \"instruction\": \"What's the capital of Spain?\",\n \"generations\": [\"Madrid\", \"Barcelona\"],\n }\n ]\n )\n)\n \n[{'instruction': \"What's the capital of Spain?\",\n 'generations': ['Madrid', 'Barcelona'],\n 'ratings': [5, 1],\n 'rationales': [\"The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\",\n \"The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"],\n 'distilabel_metadata': {'raw_output_ultra_feedback_0': \"#### Output for Text 1\\nRating: 5 (Excellent)\\nRationale: The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\\n\\n#### Output for Text 2\\nRating: 1 (Low Quality)\\nRationale: The answer is incorrect as Barcelona is not the capital of Spain. 
This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"},\n 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'}] \n format_dpo = FormatTextGenerationDPO(pipeline=Pipeline(name=\"showcase-pipeline\"))\nformat_dpo.load()\nnext(\n format_dpo.process(\n [\n {\n \"instruction\": \"What's the capital of Spain?\",\n \"generations\": [\"Madrid\", \"Barcelona\"],\n \"generation_models\": [\n \"Meta-Llama-3-8B-Instruct\",\n \"Mixtral-8x7B-Instruct-v0.1\",\n ],\n \"ratings\": [5, 1],\n }\n ]\n )\n)\n \n[{'instruction': \"What's the capital of Spain?\",\n 'generations': ['Madrid', 'Barcelona'],\n 'generation_models': ['Meta-Llama-3-8B-Instruct',\n 'Mixtral-8x7B-Instruct-v0.1'],\n 'ratings': [5, 1],\n 'prompt': \"What's the capital of Spain?\",\n 'prompt_id': '26174c953df26b3049484e4721102dca6b25d2de9e3aa22aa84f25ed1c798512',\n 'chosen': [{'role': 'user', 'content': \"What's the capital of Spain?\"},\n {'role': 'assistant', 'content': 'Madrid'}],\n 'chosen_model': 'Meta-Llama-3-8B-Instruct',\n 'chosen_rating': 5,\n 'rejected': [{'role': 'user', 'content': \"What's the capital of Spain?\"},\n {'role': 'assistant', 'content': 'Barcelona'}],\n 'rejected_model': 'Mixtral-8x7B-Instruct-v0.1',\n 'rejected_rating': 1}] \n - Or you can use Argilla to manually label the data and convert it to a preference dataset.
- Component:
PreferenceToArgilla step - Input columns:
instruction , generations , generation_models , ratings - Output columns:
instruction , generations , generation_models , ratings to_argilla = PreferenceToArgilla(\n dataset_name=\"preference-dataset\",\n dataset_workspace=\"argilla\",\n api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n api_key=\"[your-api-key]\",\n num_generations=2\n)\n Below, you can see the full pipeline definition: with Pipeline(name=\"generate-dataset\") as pipeline:\n\n load_dataset = LoadDataFromHub(repo_id=\"argilla/10Kprompts-mini\")\n\n generate_responses = [\n TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n )\n ),\n TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n tokenizer_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n )\n ),\n ]\n\n group_responses = GroupColumns(\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"model_names\"],\n )\n\n evaluate_responses = UltraFeedback(\n aspect=\"overall-rating\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n )\n )\n\n format_dpo = FormatTextGenerationDPO()\n\n to_argilla = PreferenceToArgilla(\n dataset_name=\"preference-dataset\",\n dataset_workspace=\"argilla\",\n api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n api_key=\"[your-api-key]\",\n num_generations=2\n )\n\n for task in generate_responses:\n load_dataset.connect(task)\n task.connect(group_responses)\n group_responses.connect(evaluate_responses)\n evaluate_responses.connect(format_dpo, to_argilla)\n Let's now run the pipeline and generate the preference dataset. distiset = pipeline.run()\n Let's check the preference dataset! If you have loaded the data to Argilla, you can start annotating in the Argilla UI. You can push the dataset to the Hub for sharing with the community and embed it to explore the data. distiset.push_to_hub(\"[your-owner-name]/example-preference-dataset\")\n In this tutorial, we showcased the detailed steps to build a pipeline for generating a preference dataset using distilabel. You can customize this pipeline for your own use cases and share your datasets with the community through the Hugging Face Hub, or use them to train a model for DPO or ORPO. We used a dataset containing prompts to generate responses using two different models through the serverless Hugging Face Inference API. Next, we evaluated the responses using a third model, following the UltraFeedback standards. Finally, we converted the data to a preference dataset and used Argilla for further curation. "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#generate-a-preference-dataset","title":"Generate a preference dataset","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#install-the-dependencies","title":"Install the dependencies","text":"To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. 
We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command: "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide. Along with that, you will need to install Argilla as a distilabel extra. "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#define-the-pipeline","title":"Define the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#load-the-dataset","title":"Load the dataset","text":"We will use as source data the argilla/10Kprompts-mini dataset from the Hugging Face Hub. - Component:
LoadDataFromHub - Input columns:
instruction and topic , the same as in the loaded dataset - Output columns:
instruction and topic "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#generate-responses","title":"Generate responses","text":"We need to generate the responses for the given instructions. We will use two different models available on the Hugging Face Hub through the Serverless Inference API: meta-llama/Meta-Llama-3-8B-Instruct and mistralai/Mixtral-8x7B-Instruct-v0.1 . We will also indicate the generation parameters for each model. - Component:
TextGeneration task with LLMs using InferenceEndpointsLLM - Input columns:
instruction - Output columns:
generation , distilabel_metadata , model_name for each model For your use case and to improve the results, you can use any other LLM of your choice. "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#group-the-responses","title":"Group the responses","text":"The task to evaluate the responses needs as input a list of generations. However, each model response was saved in the generation column of the subsets text_generation_0 and text_generation_1 . We will combine these two columns into a single column and the default subset. - Component:
GroupColumns - Input columns:
generation and model_name from text_generation_0 and text_generation_1 - Output columns:
generations and model_names "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#evaluate-the-responses","title":"Evaluate the responses","text":"To build our preference dataset, we need to evaluate the responses generated by the models. We will use meta-llama/Meta-Llama-3-70B-Instruct for this, applying the UltraFeedback task that judges the responses according to different dimensions (helpfulness, honesty, instruction-following, truthfulness). - Component:
UltraFeedback task with LLMs using InferenceEndpointsLLM - Input columns:
instruction , generations - Output columns:
ratings , rationales , distilabel_metadata , model_name For your use case and to improve the results, you can use any other LLM of your choice. "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#convert-to-a-preference-dataset","title":"Convert to a preference dataset","text":" - You can automatically convert it to a preference dataset with the
chosen and rejected columns. - Component:
FormatTextGenerationDPO step - Input columns:
instruction , generations , generation_models , ratings - Output columns:
prompt , prompt_id , chosen , chosen_model , chosen_rating , rejected , rejected_model , rejected_rating "},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#conclusions","title":"Conclusions","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/","title":"Generate synthetic text classification data","text":" - Goal: Generate synthetic text classification data to augment an imbalanced and limited dataset for training a topic classifier. In addition, generate new data for training a fact-based versus opinion-based classifier to add a new label.
- Libraries: argilla, hf-inference-endpoints, SetFit
- Components: LoadDataFromDicts, EmbeddingTaskGenerator, GenerateTextClassificationData
!pip install \"distilabel[hf-inference-endpoints]\"\n !pip install \"transformers~=4.40\" \"torch~=2.0\" \"setfit~=1.0\"\n Let's make the required imports: import random\nfrom collections import Counter\n\nfrom datasets import load_dataset, Dataset\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import (\n GenerateTextClassificationData,\n)\nfrom setfit import SetFitModel, Trainer, sample_dataset\n You'll need an HF_TOKEN to use the HF Inference Endpoints. Log in to use it directly within this notebook. import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n !pip install \"distilabel[argilla, hf-inference-endpoints]\"\n We will use the fancyzhx/ag_news dataset from the Hugging Face Hub as our original data source. To simulate a real-world scenario with imbalanced and limited data, we will load only 20 samples from this dataset. hf_dataset = load_dataset(\"fancyzhx/ag_news\", split=\"train[-20:]\")\n Now, we can retrieve the available labels in the dataset and examine the current data distribution. labels_topic = hf_dataset.features[\"label\"].names\nid2str = {i: labels_topic[i] for i in range(len(labels_topic))}\nprint(id2str)\nprint(Counter(hf_dataset[\"label\"]))\n \n{0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}\nCounter({0: 12, 1: 6, 2: 2})\n \n As observed, the dataset is imbalanced, with most samples falling under the World category, while the Sci/Tech category is entirely missing. Moreover, there are insufficient samples to effectively train a topic classification model. We will also define the labels for the new classification task. labels_fact_opinion = [\"Fact-based\", \"Opinion-based\"]\n To generate the data we will use the GenerateTextClassificationData task. This task will use as input classification tasks and we can define the language, difficulty and clarity required for the generated data. task = GenerateTextClassificationData(\n language=\"English\",\n difficulty=\"college\",\n clarity=\"clear\",\n num_generations=1,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.4},\n ),\n input_batch_size=5,\n)\ntask.load()\nresult = next(\n task.process([{\"task\": \"Classify the news article as fact-based or opinion-based\"}])\n)\nprint(result[0][\"distilabel_metadata\"][\"raw_input_generate_text_classification_data_0\"])\n \n[{'role': 'user', 'content': 'You have been assigned a text classification task: Classify the news article as fact-based or opinion-based\\n\\nYour mission is to write one text classification example for this task in JSON format. 
The JSON object must contain the following keys:\\n - \"input_text\": a string, the input text specified by the classification task.\\n - \"label\": a string, the correct label of the input text.\\n - \"misleading_label\": a string, an incorrect label that is related to the task.\\n\\nPlease adhere to the following guidelines:\\n - The \"input_text\" should be diverse in expression.\\n - The \"misleading_label\" must be a valid label for the given task, but not as appropriate as the \"label\" for the \"input_text\".\\n - The values for all fields should be in English.\\n - Avoid including the values of the \"label\" and \"misleading_label\" fields in the \"input_text\", that would make the task too easy.\\n - The \"input_text\" is clear and requires college level education to comprehend.\\n\\nYour output must always be a JSON object only, do not explain yourself or output anything else. Be creative!'}]\n \n For our use case, we only need to generate data for two tasks: a topic classification task and a fact versus opinion classification task. Therefore, we will define the tasks accordingly. As we will be using an smaller model for generation, we will select 2 random labels for each topic classification task and change the order for the fact versus opinion classification task ensuring more diversity in the generated data. task_templates = [\n \"Determine the news article as {}\",\n \"Classify news article as {}\",\n \"Identify the news article as {}\",\n \"Categorize the news article as {}\",\n \"Label the news article using {}\",\n \"Annotate the news article based on {}\",\n \"Determine the theme of a news article from {}\",\n \"Recognize the topic of the news article as {}\",\n]\n\nclassification_tasks = [\n {\"task\": action.format(\" or \".join(random.sample(labels_topic, 2)))}\n for action in task_templates for _ in range(4)\n] + [\n {\"task\": action.format(\" or \".join(random.sample(labels_fact_opinion, 2)))}\n for action in task_templates\n]\n Now, it's time to define and run the pipeline. As mentioned, we will load the written tasks and feed them into the GenerateTextClassificationData task. For our use case, we will be using Meta-Llama-3.1-8B-Instruct via the InferenceEndpointsLLM , with different degrees of difficulty and clarity. difficulties = [\"college\", \"high school\", \"PhD\"]\nclarity = [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n\nwith Pipeline(\"texcat-generation-pipeline\") as pipeline:\n\n tasks_generator = LoadDataFromDicts(data=classification_tasks)\n\n generate_data = []\n for difficulty in difficulties:\n for clarity_level in clarity:\n task = GenerateTextClassificationData(\n language=\"English\",\n difficulty=difficulty,\n clarity=clarity_level,\n num_generations=2,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n ),\n input_batch_size=5,\n )\n generate_data.append(task)\n\n for task in generate_data:\n tasks_generator.connect(task)\n Let's now run the pipeline and generate the synthetic data. 
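Tip: if you first want to check that all the steps are connected as expected without generating the full dataset, recent distilabel versions also provide a dry-run mode that processes only a single example batch (a sketch; the exact signature may differ between versions): distiset = pipeline.dry_run()\n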
distiset = pipeline.run()\n distiset[\"generate_text_classification_data_0\"][\"train\"][0]\n \n{'task': 'Determine the news article as Business or World',\n 'input_text': \"The recent decision by the European Central Bank to raise interest rates will likely have a significant impact on the eurozone's economic growth, with some analysts predicting a 0.5% contraction in GDP due to the increased borrowing costs. The move is seen as a measure to combat inflation, which has been rising steadily over the past year.\",\n 'label': 'Business',\n 'misleading_label': 'World',\n 'distilabel_metadata': {'raw_output_generate_text_classification_data_0': '{\\n \"input_text\": \"The recent decision by the European Central Bank to raise interest rates will likely have a significant impact on the eurozone\\'s economic growth, with some analysts predicting a 0.5% contraction in GDP due to the increased borrowing costs. The move is seen as a measure to combat inflation, which has been rising steadily over the past year.\",\\n \"label\": \"Business\",\\n \"misleading_label\": \"World\"\\n}'},\n 'model_name': 'meta-llama/Meta-Llama-3.1-8B-Instruct'} \n You can push the dataset to the Hub for sharing with the community and embed it to explore the data. distiset.push_to_hub(\"[your-owner-name]/example-texcat-generation-dataset\")\n By examining the distiset distribution, we can confirm that it includes at least the 8 required samples for each label to train our classification models with SetFit. all_labels = [\n entry[\"label\"]\n for dataset_name in distiset\n for entry in distiset[dataset_name][\"train\"]\n]\n\nCounter(all_labels)\n \nCounter({'Sci/Tech': 275,\n 'Business': 130,\n 'World': 86,\n 'Fact-based': 86,\n 'Sports': 64,\n 'Opinion-based': 54,\n None: 20,\n 'Opinion Based': 1,\n 'News/Opinion': 1,\n 'Science': 1,\n 'Environment': 1,\n 'Opinion': 1}) \n We will create two datasets with the required labels and data for our use cases. def extract_rows(distiset, labels):\n return [\n {\n \"text\": entry[\"input_text\"],\n \"label\": entry[\"label\"],\n \"id\": i\n }\n for dataset_name in distiset\n for i, entry in enumerate(distiset[dataset_name][\"train\"])\n if entry[\"label\"] in labels\n ]\n\ndata_topic = extract_rows(distiset, labels_topic)\ndata_fact_opinion = extract_rows(distiset, labels_fact_opinion)\n Get started in Argilla If you are not familiar with Argilla, we recommend taking a look at the Argilla quickstart docs. Alternatively, you can use your Hugging Face account to login to the Argilla demo Space. To get the most out of our data, we will use Argilla. First, we need to connect to the Argilla instance. import argilla as rg\n\n# Replace api_url with your url if using Docker\n# Replace api_key with your API key under \"My Settings\" in the UI\n# Uncomment the last line and set your HF_TOKEN if your space is private\nclient = rg.Argilla(\n api_url=\"https://[your-owner-name]-[your_space_name].hf.space\",\n api_key=\"[your-api-key]\",\n # headers={\"Authorization\": f\"Bearer {HF_TOKEN}\"}\n)\n We will create a Dataset for each task, with an input TextField for the text classification text and a LabelQuestion to ensure the generated labels are correct. 
def create_texcat_dataset(dataset_name, labels):\n settings = rg.Settings(\n fields=[rg.TextField(\"text\")],\n questions=[\n rg.LabelQuestion(\n name=\"label\",\n title=\"Classify the texts according to the following labels\",\n labels=labels,\n ),\n ],\n )\n return rg.Dataset(name=dataset_name, settings=settings).create()\n\n\nrg_dataset_topic = create_texcat_dataset(\"topic-classification\", labels_topic)\nrg_dataset_fact_opinion = create_texcat_dataset(\n \"fact-opinion-classification\", labels_fact_opinion\n)\n Now, we can upload the generated data to Argilla and evaluate it. We will use the generated labels as suggestions. rg_dataset_topic.records.log(data_topic)\nrg_dataset_fact_opinion.records.log(data_fact_opinion)\n Now, we can start the annotation process. Just open the dataset in the Argilla UI and start annotating the records. If the suggestions are correct, you can just click on Submit . Otherwise, you can select the correct label. Note Check this how-to guide to know more about annotating in the UI. Once, you get the annotations, let's continue by retrieving the data from Argilla and format it as a dataset with the required data. rg_dataset_topic = client.datasets(\"topic-classification\")\nrg_dataset_fact_opinion = client.datasets(\"fact-opinion-classification\")\n status_filter = rg.Query(filter=rg.Filter((\"response.status\", \"==\", \"submitted\")))\n\nsubmitted_topic = rg_dataset_topic.records(status_filter).to_list(flatten=True)\nsubmitted_fact_opinion = rg_dataset_fact_opinion.records(status_filter).to_list(\n flatten=True\n)\n def format_submitted(submitted):\n return [\n {\n \"text\": r[\"text\"],\n \"label\": r[\"label.responses\"][0],\n \"id\": i,\n }\n for i, r in enumerate(submitted)\n ]\n\ndata_topic = format_submitted(submitted_topic)\ndata_fact_opinion = format_submitted(submitted_fact_opinion)\n In our case, we will fine-tune using SetFit. However, you can select the one that best fits your requirements. The next step will be to format the data to be compatible with SetFit. In the case of the topic classification, we will need to combine the synthetic data with the original data. hf_topic = hf_dataset.to_list()\nnum = len(data_topic)\n\ndata_topic.extend(\n [\n {\n \"text\": r[\"text\"],\n \"label\": id2str[r[\"label\"]],\n \"id\": num + i,\n }\n for i, r in enumerate(hf_topic)\n ]\n)\n If we check the data distribution now, we can see that we have enough samples for each label to train our models. labels = [record[\"label\"] for record in data_topic]\nCounter(labels)\n \nCounter({'Sci/Tech': 275, 'Business': 132, 'World': 98, 'Sports': 70}) \n labels = [record[\"label\"] for record in data_fact_opinion]\nCounter(labels)\n \nCounter({'Fact-based': 86, 'Opinion-based': 54}) \n Now, let's create our training and validation datasets. The training dataset will gather 8 samples by label. In this case, the validation datasets will contain the remaining samples not included in the training datasets. 
def sample_and_split(dataset, label_column, num_samples):\n train_dataset = sample_dataset(\n dataset, label_column=label_column, num_samples=num_samples\n )\n eval_dataset = dataset.filter(lambda x: x[\"id\"] not in set(train_dataset[\"id\"]))\n return train_dataset, eval_dataset\n\n\ndataset_topic_full = Dataset.from_list(data_topic)\ndataset_fact_opinion_full = Dataset.from_list(data_fact_opinion)\n\ntrain_dataset_topic, eval_dataset_topic = sample_and_split(\n dataset_topic_full, \"label\", 8\n)\ntrain_dataset_fact_opinion, eval_dataset_fact_opinion = sample_and_split(\n dataset_fact_opinion_full, \"label\", 8\n)\n Let's train our models for each task! We will use TaylorAI/bge-micro-v2, available in the Hugging Face Hub. You can check the MTEB leaderboard to select the best model for your use case. def train_model(model_name, dataset, eval_dataset):\n model = SetFitModel.from_pretrained(model_name)\n\n trainer = Trainer(\n model=model,\n train_dataset=dataset,\n )\n trainer.train()\n metrics = trainer.evaluate(eval_dataset)\n print(metrics)\n\n return model\n model_topic = train_model(\n model_name=\"TaylorAI/bge-micro-v2\",\n dataset=train_dataset_topic,\n eval_dataset=eval_dataset_topic,\n)\nmodel_topic.save_pretrained(\"topic_classification_model\")\nmodel_topic = SetFitModel.from_pretrained(\"topic_classification_model\")\n \n***** Running training *****\n Num unique pairs = 768\n Batch size = 16\n Num epochs = 1\n Total optimization steps = 48\n \n \n{'embedding_loss': 0.1873, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}\n \n \n***** Running evaluation *****\n \n \n{'train_runtime': 4.9767, 'train_samples_per_second': 154.318, 'train_steps_per_second': 9.645, 'epoch': 1.0}\n{'accuracy': 0.8333333333333334}\n \n model_fact_opinion = train_model(\n model_name=\"TaylorAI/bge-micro-v2\",\n dataset=train_dataset_fact_opinion,\n eval_dataset=eval_dataset_fact_opinion,\n)\nmodel_fact_opinion.save_pretrained(\"fact_opinion_classification_model\")\nmodel_fact_opinion = SetFitModel.from_pretrained(\"fact_opinion_classification_model\")\n \n***** Running training *****\n Num unique pairs = 144\n Batch size = 16\n Num epochs = 1\n Total optimization steps = 9\n \n \n{'embedding_loss': 0.2985, 'learning_rate': 2e-05, 'epoch': 0.11}\n \n \n***** Running evaluation *****\n \n \n{'train_runtime': 0.8327, 'train_samples_per_second': 172.931, 'train_steps_per_second': 10.808, 'epoch': 1.0}\n{'accuracy': 0.9090909090909091}\n \n Voil\u00e0! The models are now trained and ready to be used. You can start making predictions to check the model's performance and add the new label. Optionally, you can continue using distilabel to generate additional data or Argilla to verify the quality of the predictions. def predict(model, input, labels):\n model.labels = labels\n prediction = model.predict([input])\n return prediction[0]\n predict(\n model_topic, \"The new iPhone is expected to be released next month.\", labels_topic\n)\n \n'Sci/Tech' \n predict(\n model_fact_opinion,\n \"The new iPhone is expected to be released next month.\",\n labels_fact_opinion,\n)\n \n'Opinion-based' \n In this tutorial, we showcased the detailed steps to build a pipeline for generating text classification data using distilabel. You can customize this pipeline for your own use cases and share your datasets with the community through the Hugging Face Hub. 
We defined two text classification tasks\u2014a topic classification task and a fact versus opinion classification task\u2014and generated new data using various models via the serverless Hugging Face Inference API. Then, we curated the generated data with Argilla. Finally, we trained the models with SetFit using both the original and synthetic data. "},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#generate-synthetic-text-classification-data","title":"Generate synthetic text classification data","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#install-the-dependencies","title":"Install the dependencies","text":"To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command: "},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide. Along with that, you will need to install Argilla as a distilabel extra. "},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#the-dataset","title":"The dataset","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#define-the-text-classification-task","title":"Define the text classification task","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#optional-evaluate-with-argilla","title":"(Optional) Evaluate with Argilla","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#train-your-models","title":"Train your models","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#formatting-the-data","title":"Formatting the data","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#the-actual-training","title":"The actual training","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#conclusions","title":"Conclusions","text":""},{"location":"components-gallery/","title":"Components Gallery","text":" -
Steps Explore all the available Step s that can be used for data manipulation. Steps -
Tasks Explore all the available Task s that can be used with an LLM to perform data generation, annotation, and more. Tasks -
LLMs Explore all the available LLM s integrated with distilabel . LLMs -
Embeddings Explore all the available Embeddings models integrated with distilabel . Embeddings "},{"location":"components-gallery/steps/","title":"Steps Gallery","text":"Category Overview The gallery page showcases the different types of components within distilabel . Icon Category Description text-generation Text generation steps are used to generate text based on a given prompt. chat-generation Chat generation steps are used to generate text based on a conversation. text-classification Text classification steps are used to classify text into a category. text-manipulation Text manipulation steps are used to manipulate or rewrite an input text. evol Evol steps are used to rewrite input text and evolve it to a higher quality. critique Critique steps are used to provide feedback on the quality of the data with a written explanation. scorer Scorer steps are used to evaluate and score the data with a numerical value. preference Preference steps are used to collect preferences on the data with numerical values or ranks. embedding Embedding steps are used to generate embeddings for the data. clustering Clustering steps are used to group similar data points together. columns Columns steps are used to manipulate columns in the data. filtering Filtering steps are used to filter the data based on some criteria. format Format steps are used to format the data. load Load steps are used to load the data. execution Executes python functions. save Save steps are used to save the data. labelling Labelling steps are used to label the data. -
PreferenceToArgilla Creates a preference dataset in Argilla. PreferenceToArgilla -
TextGenerationToArgilla Creates a text generation dataset in Argilla. TextGenerationToArgilla -
CombineColumns CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead. CombineColumns -
PushToHub Push data to a Hugging Face Hub dataset. PushToHub -
LoadDataFromDicts Loads a dataset from a list of dictionaries. LoadDataFromDicts -
DataSampler Step to sample from a dataset. DataSampler -
LoadDataFromHub Loads a dataset from the Hugging Face Hub. LoadDataFromHub -
LoadDataFromFileSystem Loads a dataset from a file in your filesystem. LoadDataFromFileSystem -
LoadDataFromDisk Load a dataset that was previously saved to disk. LoadDataFromDisk -
PrepareExamples Helper step to create examples from query and answers pairs used as Few Shots in APIGen. PrepareExamples -
ConversationTemplate Generate a conversation template from an instruction and a response. ConversationTemplate -
FormatTextGenerationDPO Format the output of your LLMs for Direct Preference Optimization (DPO). FormatTextGenerationDPO -
FormatChatGenerationDPO Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO). FormatChatGenerationDPO -
FormatTextGenerationSFT Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT). FormatTextGenerationSFT -
FormatChatGenerationSFT Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT). FormatChatGenerationSFT -
DeitaFiltering Filter dataset rows using DEITA filtering strategy. DeitaFiltering -
EmbeddingDedup Deduplicates text using embeddings. EmbeddingDedup -
APIGenExecutionChecker Executes the generated function calls. APIGenExecutionChecker -
MinHashDedup Deduplicates text using MinHash and MinHashLSH . MinHashDedup -
CombineOutputs Combine the outputs of several upstream steps. CombineOutputs -
ExpandColumns Expand columns that contain lists into multiple rows. ExpandColumns -
GroupColumns Combines columns from a list of StepInput . GroupColumns -
KeepColumns Keeps selected columns in the dataset. KeepColumns -
MergeColumns Merge columns from a row. MergeColumns -
DBSCAN DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples in regions of high density and expands clusters from them. DBSCAN -
UMAP UMAP is a general purpose manifold learning and dimension reduction algorithm. UMAP -
FaissNearestNeighbour Create a faiss index to get the nearest neighbours. FaissNearestNeighbour -
EmbeddingGeneration Generate embeddings using an Embeddings model. EmbeddingGeneration -
RewardModelScore Assign a score to a response using a Reward Model. RewardModelScore -
FormatPRM Helper step to transform the data into the format expected by the PRM model. FormatPRM -
TruncateTextColumn Truncate a row using a tokenizer or the number of characters. TruncateTextColumn "},{"location":"components-gallery/steps/preferencetoargilla/","title":"PreferenceToArgilla","text":"Creates a preference dataset in Argilla. Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a preference dataset, where there's one field for the instruction and one extra field per each generation within the same record, and then a rating question per each of the generation fields. The rating question asks the annotator to set a rating from 1 to 5 for each of the provided generations. "},{"location":"components-gallery/steps/preferencetoargilla/#note","title":"Note","text":"This step is meant to be used in conjunction with the UltraFeedback step, or any other step generating both ratings and responses for a given set of instruction and generations for the given instruction. But alternatively, it can also be used with any other task or step generating only the instruction and generations , as the ratings and rationales are optional. "},{"location":"components-gallery/steps/preferencetoargilla/#attributes","title":"Attributes","text":" -
num_generations: The number of generations to include in the dataset. -
dataset_name: The name of the dataset in Argilla. -
dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. -
api_url: The URL of the Argilla API. Defaults to None , which means it will be read from the ARGILLA_API_URL environment variable. -
api_key: The API key to authenticate with Argilla. Defaults to None , which means it will be read from the ARGILLA_API_KEY environment variable. "},{"location":"components-gallery/steps/preferencetoargilla/#runtime-parameters","title":"Runtime Parameters","text":""},{"location":"components-gallery/steps/preferencetoargilla/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[generations]\n ICOL2[ratings]\n ICOL3[rationales]\n end\n end\n\n subgraph PreferenceToArgilla\n StepInput[Input Columns: instruction, generations, ratings, rationales]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n "},{"location":"components-gallery/steps/preferencetoargilla/#inputs","title":"Inputs","text":" -
instruction (str ): The instruction that was used to generate the completion. -
generations (List[str] ): The completions that were generated based on the input instruction. -
ratings (List[str] , optional): The ratings for the generations. If not provided, the generated ratings won't be pushed to Argilla. -
rationales (List[str] , optional): The rationales for the ratings. If not provided, the generated rationales won't be pushed to Argilla. "},{"location":"components-gallery/steps/preferencetoargilla/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/preferencetoargilla/#push-a-preference-dataset-to-an-argilla-instance","title":"Push a preference dataset to an Argilla instance","text":"from distilabel.steps import PreferenceToArgilla\n\nto_argilla = PreferenceToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n "},{"location":"components-gallery/steps/preferencetoargilla/#it-can-also-include-ratings-and-rationales","title":"It can also include ratings and rationales","text":"result = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generations\": [\"first_generation\", \"second_generation\"],\n \"ratings\": [\"4\", \"5\"],\n \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n }\n ],\n )\n)\n# >>> result\n# [\n# {\n# 'instruction': 'instruction',\n# 'generations': ['first_generation', 'second_generation'],\n# 'ratings': ['4', '5'],\n# 'rationales': ['rationale for 4', 'rationale for 5']\n# }\n# ]\n "},{"location":"components-gallery/steps/textgenerationtoargilla/","title":"TextGenerationToArgilla","text":"Creates a text generation dataset in Argilla. Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a text-generation dataset, where there's one field per each input, and then a label question to rate the quality of the completion in either bad (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d). "},{"location":"components-gallery/steps/textgenerationtoargilla/#note","title":"Note","text":"This step is meant to be used in conjunction with a TextGeneration step and no column mapping is needed, as it will use the default values for the instruction and generation columns. "},{"location":"components-gallery/steps/textgenerationtoargilla/#attributes","title":"Attributes","text":" -
dataset_name: The name of the dataset in Argilla. -
dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to None , which means it will be created in the default workspace. -
api_url: The URL of the Argilla API. Defaults to None , which means it will be read from the ARGILLA_API_URL environment variable. -
api_key: The API key to authenticate with Argilla. Defaults to None , which means it will be read from the ARGILLA_API_KEY environment variable. "},{"location":"components-gallery/steps/textgenerationtoargilla/#runtime-parameters","title":"Runtime Parameters","text":""},{"location":"components-gallery/steps/textgenerationtoargilla/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[generation]\n end\n end\n\n subgraph TextGenerationToArgilla\n StepInput[Input Columns: instruction, generation]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n "},{"location":"components-gallery/steps/textgenerationtoargilla/#inputs","title":"Inputs","text":""},{"location":"components-gallery/steps/textgenerationtoargilla/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/textgenerationtoargilla/#push-a-text-generation-dataset-to-an-argilla-instance","title":"Push a text generation dataset to an Argilla instance","text":"from distilabel.steps import PreferenceToArgilla\n\nto_argilla = TextGenerationToArgilla(\n num_generations=2,\n api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n api_key=\"api.key\",\n dataset_name=\"argilla_dataset\",\n dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n to_argilla.process(\n [\n {\n \"instruction\": \"instruction\",\n \"generation\": \"generation\",\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generation': 'generation'}]\n "},{"location":"components-gallery/steps/combinecolumns/","title":"CombineColumns","text":"CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead. "},{"location":"components-gallery/steps/combinecolumns/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n end\n\n subgraph CombineColumns\n end\n\n "},{"location":"components-gallery/steps/pushtohub/","title":"PushToHub","text":"Push data to a Hugging Face Hub dataset. A GlobalStep which creates a datasets.Dataset with the input data and pushes it to the Hugging Face Hub. "},{"location":"components-gallery/steps/pushtohub/#attributes","title":"Attributes","text":" -
repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded. -
split: The split of the dataset that will be pushed. Defaults to \"train\" . -
private: Whether the dataset to be pushed should be private or not. Defaults to False . -
token: The token that will be used to authenticate in the Hub. If not provided, the token will be tried to be obtained from the environment variable HF_TOKEN . If not provided using one of the previous methods, then huggingface_hub library will try to use the token from the local Hugging Face CLI configuration. Defaults to None . "},{"location":"components-gallery/steps/pushtohub/#runtime-parameters","title":"Runtime Parameters","text":" -
repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded. -
split: The split of the dataset that will be pushed. -
private: Whether the dataset to be pushed should be private or not. -
token: The token that will be used to authenticate in the Hub. "},{"location":"components-gallery/steps/pushtohub/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n end\n\n subgraph PushToHub\n StepInput[Input Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n "},{"location":"components-gallery/steps/pushtohub/#inputs","title":"Inputs","text":" - dynamic (
all ): all columns from the input will be used to create the dataset. "},{"location":"components-gallery/steps/pushtohub/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/pushtohub/#push-batches-of-your-dataset-to-the-hugging-face-hub-repository","title":"Push batches of your dataset to the Hugging Face Hub repository","text":"from distilabel.steps import PushToHub\n\npush = PushToHub(repo_id=\"path_to/repo\")\npush.load()\n\nresult = next(\n push.process(\n [\n {\n \"instruction\": \"instruction \",\n \"generation\": \"generation\"\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction ', 'generation': 'generation'}]\n "},{"location":"components-gallery/steps/loaddatafromdicts/","title":"LoadDataFromDicts","text":"Loads a dataset from a list of dictionaries. GeneratorStep that loads a dataset from a list of dictionaries and yields it in batches. "},{"location":"components-gallery/steps/loaddatafromdicts/#attributes","title":"Attributes","text":" - data: The list of dictionaries to load the data from.
"},{"location":"components-gallery/steps/loaddatafromdicts/#runtime-parameters","title":"Runtime Parameters","text":" - batch_size: The batch size to use when processing the data.
"},{"location":"components-gallery/steps/loaddatafromdicts/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph LoadDataFromDicts\n StepOutput[Output Columns: dynamic]\n end\n\n StepOutput --> OCOL0\n "},{"location":"components-gallery/steps/loaddatafromdicts/#outputs","title":"Outputs","text":" - dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
"},{"location":"components-gallery/steps/loaddatafromdicts/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromdicts/#load-data-from-a-list-of-dictionaries","title":"Load data from a list of dictionaries","text":"from distilabel.steps import LoadDataFromDicts\n\nloader = LoadDataFromDicts(\n data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n batch_size=2\n)\nloader.load()\n\nresult = next(loader.process())\n# >>> result\n# ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n "},{"location":"components-gallery/steps/datasampler/","title":"DataSampler","text":"Step to sample from a dataset. GeneratorStep that samples from a dataset and yields it in batches. This step is useful when you have a pipeline that can benefit from using examples in the prompts for example as few-shot learning, that can be changing on each row. For example, you can pass a list of dictionaries with N examples and generate M samples from it (assuming you have another step loading data, this M should have the same size as the data being loaded in that step). The size S argument is the number of samples per row generated, so each example would contain S examples to be used as examples. "},{"location":"components-gallery/steps/datasampler/#attributes","title":"Attributes","text":" -
data: The list of dictionaries to sample from. -
size: Number of samples per example. For example, in a few-shot learning scenario, this is the number of few-shot examples that will be generated per example. Defaults to 2. -
samples: Number of examples that will be generated by the step in total. If used with another loader step, this should be the same as the number of samples in the loader step. Defaults to 100. "},{"location":"components-gallery/steps/datasampler/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph DataSampler\n StepOutput[Output Columns: dynamic]\n end\n\n StepOutput --> OCOL0\n "},{"location":"components-gallery/steps/datasampler/#outputs","title":"Outputs","text":" - dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
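Once sampled, these examples are typically rendered into the prompt of a downstream task. A minimal illustrative helper for that step (the format_few_shot name, the few_shot_block column and the prompt wording are illustrative choices, not part of distilabel) could look like: def format_few_shot(row):\n    # \"sample\" is the column produced by the DataSampler example below\n    examples = \" ; \".join(row[\"sample\"])\n    return {\"few_shot_block\": f\"Here are some examples: {examples}\"}\n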
"},{"location":"components-gallery/steps/datasampler/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/datasampler/#sample-data-from-a-list-of-dictionaries","title":"Sample data from a list of dictionaries","text":"from distilabel.steps import DataSampler\n\nsampler = DataSampler(\n data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n samples=10,\n size=2,\n batch_size=4\n)\nsampler.load()\n\nresult = next(sampler.process())\n# >>> result\n# ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n "},{"location":"components-gallery/steps/datasampler/#pipeline-with-a-loader-and-a-sampler-combined-in-a-single-stream","title":"Pipeline with a loader and a sampler combined in a single stream","text":"from datasets import load_dataset\n\nfrom distilabel.steps import LoadDataFromDicts, DataSampler\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples\nfrom distilabel.pipeline import Pipeline\n\nds = (\n load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n .shuffle(seed=42)\n .select(range(500))\n .to_list()\n)\ndata = [\n {\n \"func_name\": \"final_velocity\",\n \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n },\n {\n \"func_name\": \"permutation_count\",\n \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n },\n {\n \"func_name\": \"getdivision\",\n \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n },\n]\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n loader_seeds = LoadDataFromDicts(data=data)\n sampler = DataSampler(\n data=ds,\n size=2,\n samples=len(data),\n batch_size=8,\n )\n prep_examples = PrepareExamples()\n\n sampler >> prep_examples\n (\n [loader_seeds, prep_examples]\n >> combine_steps\n )\n# Now we have a single stream of data with the loader and the sampler data\n "},{"location":"components-gallery/steps/loaddatafromhub/","title":"LoadDataFromHub","text":"Loads a dataset from the Hugging Face Hub. GeneratorStep that loads a dataset from the Hugging Face Hub using the datasets library. "},{"location":"components-gallery/steps/loaddatafromhub/#attributes","title":"Attributes","text":" -
repo_id: The Hugging Face Hub repository ID of the dataset to load. -
split: The split of the dataset to load. -
config: The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations. "},{"location":"components-gallery/steps/loaddatafromhub/#runtime-parameters","title":"Runtime Parameters","text":" -
batch_size: The batch size to use when processing the data. -
repo_id: The Hugging Face Hub repository ID of the dataset to load. -
split: The split of the dataset to load. Defaults to 'train'. -
config: The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations. -
revision: The revision of the dataset to load. Defaults to the latest revision. -
streaming: Whether to load the dataset in streaming mode or not. Defaults to False . -
num_examples: The number of examples to load from the dataset. By default, all examples will be loaded. -
storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . "},{"location":"components-gallery/steps/loaddatafromhub/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph LoadDataFromHub\n StepOutput[Output Columns: dynamic]\n end\n\n StepOutput --> OCOL0\n "},{"location":"components-gallery/steps/loaddatafromhub/#outputs","title":"Outputs","text":" - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. "},{"location":"components-gallery/steps/loaddatafromhub/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromhub/#load-data-from-a-dataset-in-hugging-face-hub","title":"Load data from a dataset in Hugging Face Hub","text":"from distilabel.steps import LoadDataFromHub\n\nloader = LoadDataFromHub(\n repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n split=\"test\",\n batch_size=2\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'prompt': 'Arianna has 12...', False)\n "},{"location":"components-gallery/steps/loaddatafromfilesystem/","title":"LoadDataFromFileSystem","text":"Loads a dataset from a file in your filesystem. GeneratorStep that creates a dataset from a file in the filesystem, uses Hugging Face datasets library. Take a look at Hugging Face Datasets for more information of the supported file types. "},{"location":"components-gallery/steps/loaddatafromfilesystem/#attributes","title":"Attributes","text":" -
data_files: The path to the file, or directory containing the files that make up the dataset. -
split: The split of the dataset to load (typically train , test or validation ). "},{"location":"components-gallery/steps/loaddatafromfilesystem/#runtime-parameters","title":"Runtime Parameters","text":" -
batch_size: The batch size to use when processing the data. -
data_files: The path to the file, or directory containing the files that make up the dataset. -
split: The split of the dataset to load. Defaults to 'train'. -
streaming: Whether to load the dataset in streaming mode or not. Defaults to False . -
num_examples: The number of examples to load from the dataset. By default, all examples will be loaded. -
storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . -
filetype: The expected filetype. If not provided, it will be inferred from the file extension. For more than one file, it will be inferred from the first file. "},{"location":"components-gallery/steps/loaddatafromfilesystem/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph LoadDataFromFileSystem\n StepOutput[Output Columns: dynamic]\n end\n\n StepOutput --> OCOL0\n "},{"location":"components-gallery/steps/loaddatafromfilesystem/#outputs","title":"Outputs","text":" - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. "},{"location":"components-gallery/steps/loaddatafromfilesystem/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-from-a-hugging-face-dataset-in-your-file-system","title":"Load data from a Hugging Face dataset in your file system","text":"from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/loaddatafromfilesystem/#specify-a-filetype-if-the-file-extension-is-not-expected","title":"Specify a filetype if the file extension is not expected","text":"from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-from-a-file-in-your-cloud-provider","title":"Load data from a file in your cloud provider","text":"from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n data_files=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-passing-a-glob-pattern","title":"Load data passing a glob pattern","text":"from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n data_files=\"path/to/dataset/*.jsonl\",\n streaming=True\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/loaddatafromdisk/","title":"LoadDataFromDisk","text":"Load a dataset that was previously saved to disk. If you previously saved your dataset using the save_to_disk method, or Distiset.save_to_disk you can load it again to build a new pipeline using this class. "},{"location":"components-gallery/steps/loaddatafromdisk/#attributes","title":"Attributes","text":" -
dataset_path: The path to the dataset or distiset. -
split: The split of the dataset to load (typically train , test or validation ). -
config: The configuration of the dataset to load. Defaults to default ; if there are multiple configurations in the dataset, this must be supplied or an error is raised. "},{"location":"components-gallery/steps/loaddatafromdisk/#runtime-parameters","title":"Runtime Parameters","text":" -
batch_size: The batch size to use when processing the data. -
dataset_path: The path to the dataset or distiset. -
is_distiset: Whether the dataset to load is a Distiset or not. Defaults to False. -
split: The split of the dataset to load. Defaults to 'train'. -
config: The configuration of the dataset to load. Defaults to default ; if there are multiple configurations in the dataset, this must be supplied or an error is raised. -
num_examples: The number of examples to load from the dataset. By default, all examples will be loaded. -
storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None . "},{"location":"components-gallery/steps/loaddatafromdisk/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph LoadDataFromDisk\n StepOutput[Output Columns: dynamic]\n end\n\n StepOutput --> OCOL0\n "},{"location":"components-gallery/steps/loaddatafromdisk/#outputs","title":"Outputs","text":" - dynamic (
all ): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub. "},{"location":"components-gallery/steps/loaddatafromdisk/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-hugging-face-dataset","title":"Load data from a Hugging Face Dataset","text":"from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-distilabel-distiset","title":"Load data from a distilabel Distiset","text":"from distilabel.steps import LoadDataFromDisk\n\n# Specify the configuration to load.\nloader = LoadDataFromDisk(\n dataset_path=\"path/to/dataset\",\n is_distiset=True,\n config=\"leaf_step_1\"\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n "},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-hugging-face-dataset-or-distiset-in-your-cloud-provider","title":"Load data from a Hugging Face Dataset or Distiset in your cloud provider","text":"from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(\n dataset_path=\"gcs://path/to/dataset\",\n storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n "},{"location":"components-gallery/steps/prepareexamples/","title":"PrepareExamples","text":"Helper step to create examples from query and answers pairs used as Few Shots in APIGen. "},{"location":"components-gallery/steps/prepareexamples/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[query]\n ICOL1[answers]\n end\n subgraph New columns\n OCOL0[examples]\n end\n end\n\n subgraph PrepareExamples\n StepInput[Input Columns: query, answers]\n StepOutput[Output Columns: examples]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/prepareexamples/#inputs","title":"Inputs","text":""},{"location":"components-gallery/steps/prepareexamples/#outputs","title":"Outputs","text":" - examples (
str ): The formatted examples. "},{"location":"components-gallery/steps/prepareexamples/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/prepareexamples/#generate-examples-for-apigen","title":"Generate examples for APIGen","text":"from distilabel.steps.tasks.apigen.utils import PrepareExamples\n\nprepare_examples = PrepareExamples()\nresult = next(prepare_examples.process(\n [\n {\n \"query\": ['I need the area of circles with radius 2.5, 5, and 7.5 inches, please.', 'Can you provide the current locations of buses and trolleys on route 12?'],\n \"answers\": ['[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]', '[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]']\n }\n ]\n)\n# result\n# [{'examples': '## Query:\\nI need the area of circles with radius 2.5, 5, and 7.5 inches, please.\\n## Answers:\\n[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]\\n\\n## Query:\\nCan you provide the current locations of buses and trolleys on route 12?\\n## Answers:\\n[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]'}, {'examples': '## Query:\\nI need the area of circles with radius 2.5, 5, and 7.5 inches, please.\\n## Answers:\\n[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]\\n\\n## Query:\\nCan you provide the current locations of buses and trolleys on route 12?\\n## Answers:\\n[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]'}]\n "},{"location":"components-gallery/steps/conversationtemplate/","title":"ConversationTemplate","text":"Generate a conversation template from an instruction and a response. "},{"location":"components-gallery/steps/conversationtemplate/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[response]\n end\n subgraph New columns\n OCOL0[conversation]\n end\n end\n\n subgraph ConversationTemplate\n StepInput[Input Columns: instruction, response]\n StepOutput[Output Columns: conversation]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/conversationtemplate/#inputs","title":"Inputs","text":""},{"location":"components-gallery/steps/conversationtemplate/#outputs","title":"Outputs","text":" - conversation (
ChatType ): The conversation template. "},{"location":"components-gallery/steps/conversationtemplate/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/conversationtemplate/#create-a-conversation-from-an-instruction-and-a-response","title":"Create a conversation from an instruction and a response","text":"from distilabel.steps import ConversationTemplate\n\nconv_template = ConversationTemplate()\nconv_template.load()\n\nresult = next(\n conv_template.process(\n [\n {\n \"instruction\": \"Hello\",\n \"response\": \"Hi\",\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n "},{"location":"components-gallery/steps/formattextgenerationdpo/","title":"FormatTextGenerationDPO","text":"Format the output of your LLMs for Direct Preference Optimization (DPO). FormatTextGenerationDPO is a Step that formats the output of the combination of a TextGeneration task with a preference Task i.e. a task generating ratings , so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings . Use this step to transform the output of a combination of a TextGeneration + a preference task such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook . "},{"location":"components-gallery/steps/formattextgenerationdpo/#note","title":"Note","text":"The generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations. "},{"location":"components-gallery/steps/formattextgenerationdpo/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[system_prompt]\n ICOL1[instruction]\n ICOL2[generations]\n ICOL3[generation_models]\n ICOL4[ratings]\n end\n subgraph New columns\n OCOL0[prompt]\n OCOL1[prompt_id]\n OCOL2[chosen]\n OCOL3[chosen_model]\n OCOL4[chosen_rating]\n OCOL5[rejected]\n OCOL6[rejected_model]\n OCOL7[rejected_rating]\n end\n end\n\n subgraph FormatTextGenerationDPO\n StepInput[Input Columns: system_prompt, instruction, generations, generation_models, ratings]\n StepOutput[Output Columns: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n ICOL4 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n StepOutput --> OCOL5\n StepOutput --> OCOL6\n StepOutput --> OCOL7\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/formattextgenerationdpo/#inputs","title":"Inputs","text":" -
system_prompt (str , optional): The system prompt used within the LLM to generate the generations , if available. -
instruction (str ): The instruction used to generate the generations with the LLM . -
generations (List[str] ): The generations produced by the LLM . -
generation_models (List[str] , optional): The model names used to generate the generations , only available if the model_name from the TextGeneration task/s is combined into a single column with this name; otherwise, it will be ignored. -
ratings (List[float] ): The ratings for each of the generations , produced by a preference task such as UltraFeedback . "},{"location":"components-gallery/steps/formattextgenerationdpo/#outputs","title":"Outputs","text":" -
prompt (str ): The instruction used to generate the generations with the LLM . -
prompt_id (str ): The SHA256 hash of the prompt . -
chosen (List[Dict[str, str]] ): The chosen generation based on the ratings . -
chosen_model (str , optional): The model name used to generate the chosen generation, if the generation_models are available. -
chosen_rating (float ): The rating of the chosen generation. -
rejected (List[Dict[str, str]] ): The rejected generation based on the ratings . -
rejected_model (str , optional): The model name used to generate the rejected generation, if the generation_models are available. -
rejected_rating (float ): The rating of the rejected generation. "},{"location":"components-gallery/steps/formattextgenerationdpo/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formattextgenerationdpo/#format-your-dataset-for-dpo-fine-tuning","title":"Format your dataset for DPO fine tuning","text":"from distilabel.steps import FormatTextGenerationDPO\n\nformat_dpo = FormatTextGenerationDPO()\nformat_dpo.load()\n\n# NOTE: Both \"system_prompt\" and \"generation_models\" can be added optionally.\nresult = next(\n format_dpo.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n)\n# >>> result\n# [\n# { 'instruction': \"What's 2+2?\",\n# 'generations': ['4', '5', '6'],\n# 'ratings': [1, 0, -1],\n# 'prompt': \"What's 2+2?\",\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'chosen_rating': 1,\n# 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n# 'rejected_rating': -1\n# }\n# ]\n "},{"location":"components-gallery/steps/formatchatgenerationdpo/","title":"FormatChatGenerationDPO","text":"Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO). FormatChatGenerationDPO is a Step that formats the output of the combination of a ChatGeneration task with a preference Task i.e. a task generating ratings such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook ., so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings . "},{"location":"components-gallery/steps/formatchatgenerationdpo/#note","title":"Note","text":"The messages column should contain at least one message from the user, the generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations. "},{"location":"components-gallery/steps/formatchatgenerationdpo/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[messages]\n ICOL1[generations]\n ICOL2[generation_models]\n ICOL3[ratings]\n end\n subgraph New columns\n OCOL0[prompt]\n OCOL1[prompt_id]\n OCOL2[chosen]\n OCOL3[chosen_model]\n OCOL4[chosen_rating]\n OCOL5[rejected]\n OCOL6[rejected_model]\n OCOL7[rejected_rating]\n end\n end\n\n subgraph FormatChatGenerationDPO\n StepInput[Input Columns: messages, generations, generation_models, ratings]\n StepOutput[Output Columns: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n StepOutput --> OCOL5\n StepOutput --> OCOL6\n StepOutput --> OCOL7\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/formatchatgenerationdpo/#inputs","title":"Inputs","text":" -
messages (List[Dict[str, str]] ): The conversation messages. -
generations (List[str] ): The generations produced by the LLM . -
generation_models (List[str] , optional): The model names used to generate the generations , only available if the model_name from the ChatGeneration task/s is combined into a single column with this name; otherwise, it will be ignored. -
ratings (List[float] ): The ratings for each of the generations , produced by a preference task such as UltraFeedback . "},{"location":"components-gallery/steps/formatchatgenerationdpo/#outputs","title":"Outputs","text":" -
prompt (str ): The user message used to generate the generations with the LLM . -
prompt_id (str ): The SHA256 hash of the prompt . -
chosen (List[Dict[str, str]] ): The chosen generation based on the ratings . -
chosen_model (str , optional): The model name used to generate the chosen generation, if the generation_models are available. -
chosen_rating (float ): The rating of the chosen generation. -
rejected (List[Dict[str, str]] ): The rejected generation based on the ratings . -
rejected_model (str , optional): The model name used to generate the rejected generation, if the generation_models are available. -
rejected_rating (float ): The rating of the rejected generation. "},{"location":"components-gallery/steps/formatchatgenerationdpo/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formatchatgenerationdpo/#format-your-dataset-for-dpo-fine-tuning","title":"Format your dataset for DPO fine tuning","text":"from distilabel.steps import FormatChatGenerationDPO\n\nformat_dpo = FormatChatGenerationDPO()\nformat_dpo.load()\n\n# NOTE: \"generation_models\" can be added optionally.\nresult = next(\n format_dpo.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n \"generations\": [\"4\", \"5\", \"6\"],\n \"ratings\": [1, 0, -1],\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n# 'generations': ['4', '5', '6'],\n# 'ratings': [1, 0, -1],\n# 'prompt': \"What's 2+2?\",\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'chosen_rating': 1,\n# 'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n# 'rejected_rating': -1\n# }\n# ]\n "},{"location":"components-gallery/steps/formattextgenerationsft/","title":"FormatTextGenerationSFT","text":"Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT). FormatTextGenerationSFT is a Step that formats the output of a TextGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook . The output of the TextGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation. "},{"location":"components-gallery/steps/formattextgenerationsft/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[system_prompt]\n ICOL1[instruction]\n ICOL2[generation]\n end\n subgraph New columns\n OCOL0[prompt]\n OCOL1[prompt_id]\n OCOL2[messages]\n end\n end\n\n subgraph FormatTextGenerationSFT\n StepInput[Input Columns: system_prompt, instruction, generation]\n StepOutput[Output Columns: prompt, prompt_id, messages]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/formattextgenerationsft/#inputs","title":"Inputs","text":" -
system_prompt (str , optional): The system prompt used within the LLM to generate the generation , if available. -
instruction (str ): The instruction used to generate the generation with the LLM . -
generation (str ): The generation produced by the LLM . "},{"location":"components-gallery/steps/formattextgenerationsft/#outputs","title":"Outputs","text":" -
prompt (str ): The instruction used to generate the generation with the LLM . -
prompt_id (str ): The SHA256 hash of the prompt . -
messages (List[Dict[str, str]] ): The chat-like conversation with the instruction as the user message and the generation as the assistant message. "},{"location":"components-gallery/steps/formattextgenerationsft/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formattextgenerationsft/#format-your-dataset-for-sft-fine-tuning","title":"Format your dataset for SFT fine tuning","text":"from distilabel.steps import FormatTextGenerationSFT\n\nformat_sft = FormatTextGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n format_sft.process(\n [\n {\n \"instruction\": \"What's 2+2?\",\n \"generation\": \"4\"\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'instruction': 'What's 2+2?',\n# 'generation': '4',\n# 'prompt': 'What's 2+2?',\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n# }\n# ]\n "},{"location":"components-gallery/steps/formatchatgenerationsft/","title":"FormatChatGenerationSFT","text":"Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT). FormatChatGenerationSFT is a Step that formats the output of a ChatGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook . The output of the ChatGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation. "},{"location":"components-gallery/steps/formatchatgenerationsft/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[system_prompt]\n ICOL1[instruction]\n ICOL2[generation]\n end\n subgraph New columns\n OCOL0[prompt]\n OCOL1[prompt_id]\n OCOL2[messages]\n end\n end\n\n subgraph FormatChatGenerationSFT\n StepInput[Input Columns: system_prompt, instruction, generation]\n StepOutput[Output Columns: prompt, prompt_id, messages]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/formatchatgenerationsft/#inputs","title":"Inputs","text":" -
system_prompt (str , optional): The system prompt used within the LLM to generate the generation , if available. -
instruction (str ): The instruction used to generate the generation with the LLM . -
generation (str ): The generation produced by the LLM . "},{"location":"components-gallery/steps/formatchatgenerationsft/#outputs","title":"Outputs","text":" -
prompt (str ): The instruction used to generate the generation with the LLM . -
prompt_id (str ): The SHA256 hash of the prompt . -
messages (List[Dict[str, str]] ): The chat-like conversation with the instruction as the user message and the generation as the assistant message. "},{"location":"components-gallery/steps/formatchatgenerationsft/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formatchatgenerationsft/#format-your-dataset-for-sft","title":"Format your dataset for SFT","text":"from distilabel.steps import FormatChatGenerationSFT\n\nformat_sft = FormatChatGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n format_sft.process(\n [\n {\n \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n \"generation\": \"4\"\n }\n ]\n )\n)\n# >>> result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n# 'generation': '4',\n# 'prompt': 'What's 2+2?',\n# 'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n# }\n# ]\n "},{"location":"components-gallery/steps/deitafiltering/","title":"DeitaFiltering","text":"Filter dataset rows using DEITA filtering strategy. Filter the dataset based on the DEITA score and the cosine distance between the embeddings. It's an implementation of the filtering step from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. "},{"location":"components-gallery/steps/deitafiltering/#attributes","title":"Attributes","text":" -
data_budget: The desired size of the dataset after filtering. -
diversity_threshold: If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset. Defaults to 0.9 . -
normalize_embeddings: Whether to normalize the embeddings before computing the cosine distance. Defaults to True . "},{"location":"components-gallery/steps/deitafiltering/#runtime-parameters","title":"Runtime Parameters","text":" -
data_budget: The desired size of the dataset after filtering. -
diversity_threshold: If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset. "},{"location":"components-gallery/steps/deitafiltering/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[evol_instruction_score]\n ICOL1[evol_response_score]\n ICOL2[embedding]\n end\n subgraph New columns\n OCOL0[deita_score]\n OCOL1[deita_score_computed_with]\n OCOL2[nearest_neighbor_distance]\n end\n end\n\n subgraph DeitaFiltering\n StepInput[Input Columns: evol_instruction_score, evol_response_score, embedding]\n StepOutput[Output Columns: deita_score, deita_score_computed_with, nearest_neighbor_distance]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/deitafiltering/#inputs","title":"Inputs","text":" -
evol_instruction_score (float ): The score of the instruction generated by ComplexityScorer step. -
evol_response_score (float ): The score of the response generated by QualityScorer step. -
embedding (List[float] ): The embedding generated for the conversation of the instruction-response pair using GenerateEmbeddings step. "},{"location":"components-gallery/steps/deitafiltering/#outputs","title":"Outputs","text":" -
deita_score (float ): The DEITA score for the instruction-response pair. -
deita_score_computed_with (List[str] ): The scores used to compute the DEITA score. -
nearest_neighbor_distance (float ): The cosine distance between the embeddings of the instruction-response pair. "},{"location":"components-gallery/steps/deitafiltering/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/deitafiltering/#filter-the-dataset-based-on-the-deita-score-and-the-cosine-distance-between-the-embeddings","title":"Filter the dataset based on the DEITA score and the cosine distance between the embeddings","text":"from distilabel.steps import DeitaFiltering\n\ndeita_filtering = DeitaFiltering(data_budget=1)\n\ndeita_filtering.load()\n\nresult = next(\n deita_filtering.process(\n [\n {\n \"evol_instruction_score\": 0.5,\n \"evol_response_score\": 0.5,\n \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n },\n {\n \"evol_instruction_score\": 0.6,\n \"evol_response_score\": 0.6,\n \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n },\n {\n \"evol_instruction_score\": 0.7,\n \"evol_response_score\": 0.7,\n \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n },\n ],\n )\n)\n# >>> result\n# [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n "},{"location":"components-gallery/steps/deitafiltering/#references","title":"References","text":" - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
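The description above does not spell out how deita_score is combined from the two input scores; the example output (0.5 and 0.5 giving 0.25) suggests it is their product. A purely illustrative, hypothetical re-implementation of the selection idea, not the library's code:

# Hypothetical sketch: score = product of the two scores, then keep the highest-scored
# rows that are diverse enough, until the data budget is reached.
rows = [
    {"evol_instruction_score": 0.5, "evol_response_score": 0.5, "nearest_neighbor_distance": 1.90},
    {"evol_instruction_score": 0.7, "evol_response_score": 0.7, "nearest_neighbor_distance": 0.10},
]
data_budget = 1
diversity_threshold = 0.9

for row in rows:
    row["deita_score"] = row["evol_instruction_score"] * row["evol_response_score"]

selected = []
for row in sorted(rows, key=lambda r: r["deita_score"], reverse=True):
    if len(selected) >= data_budget:
        break
    if row["nearest_neighbor_distance"] > diversity_threshold:
        selected.append(row)
# With these illustrative values only the first row survives, mirroring the example above.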
"},{"location":"components-gallery/steps/embeddingdedup/","title":"EmbeddingDedup","text":"Deduplicates text using embeddings. EmbeddingDedup is a Step that detects near-duplicates in datasets, using embeddings to compare the similarity between the texts. The typical workflow with this step would include having a dataset with embeddings precomputed, and then (possibly using the FaissNearestNeighbour ) using the nn_indices and nn_scores , determine the texts that are duplicate. "},{"location":"components-gallery/steps/embeddingdedup/#attributes","title":"Attributes","text":" - threshold: the threshold to consider 2 examples as duplicates. It's dependent on the type of index that was used to generate the embeddings. For example, if the embeddings were generated using cosine similarity, a threshold of
0.9 would mark all the texts with a cosine similarity above that value as duplicates. Higher values detect fewer duplicates in such an index, but that should be taken into account when building it. Defaults to 0.9 . Runtime Parameters: - threshold : the threshold to consider 2 examples as duplicates. "},{"location":"components-gallery/steps/embeddingdedup/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[nn_indices]\n ICOL1[nn_scores]\n end\n subgraph New columns\n OCOL0[keep_row_after_embedding_filtering]\n end\n end\n\n subgraph EmbeddingDedup\n StepInput[Input Columns: nn_indices, nn_scores]\n StepOutput[Output Columns: keep_row_after_embedding_filtering]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/embeddingdedup/#inputs","title":"Inputs","text":" -
nn_indices (List[int] ): a list containing the indices of the k nearest neighbours in the inputs for the row. -
nn_scores (List[float] ): a list containing the score or distance to each k nearest neighbour in the inputs. "},{"location":"components-gallery/steps/embeddingdedup/#outputs","title":"Outputs","text":" - keep_row_after_embedding_filtering (
bool ): boolean indicating if the piece text is not a duplicate i.e. this text should be kept. "},{"location":"components-gallery/steps/embeddingdedup/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/embeddingdedup/#deduplicate-a-list-of-texts-using-embedding-information","title":"Deduplicate a list of texts using embedding information","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n data = LoadDataFromDicts(\n data=[\n {\n \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n \"embedding\": [\n 0.018477669046149742,\n -0.03748236608841726,\n 0.001919870620352492,\n 0.024918478063770535,\n 0.02348063521315178,\n 0.0038251285566308375,\n -0.01723884983037716,\n 0.02881971942372201,\n ],\n \"nn_indices\": [0, 1],\n \"nn_scores\": [\n 0.9164746999740601,\n 0.782106876373291,\n ],\n },\n {\n \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n \"embedding\": [\n -0.0023464179614082125,\n -0.07325472251663565,\n -0.06058678419516501,\n -0.02100326928586996,\n -0.013462744792362657,\n 0.027368447064244242,\n -0.003916070100455717,\n 0.01243614518480423,\n ],\n \"nn_indices\": [0, 2],\n \"nn_scores\": [\n 0.7552462220191956,\n 0.7261884808540344,\n ],\n },\n {\n \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on breaking down complex music notation into understandable steps for their students.\",\n \"embedding\": [\n -0.01630817942328242,\n -0.023760151552345232,\n -0.014249650090627883,\n -0.005713686451446624,\n -0.016033059279131567,\n 0.0071440908501058786,\n -0.05691099643425161,\n 0.01597412704817784,\n ],\n \"nn_indices\": [1, 2],\n \"nn_scores\": [\n 0.8107735514640808,\n 0.7172299027442932,\n ],\n },\n ],\n batch_size=batch_size,\n )\n # In general you should do something like this before the deduplication step, to obtain the\n # `nn_indices` and `nn_scores`. In this case the embeddings are already normalized, so there's\n # no need for it.\n # nn = FaissNearestNeighbour(\n # k=30,\n # metric_type=faiss.METRIC_INNER_PRODUCT,\n # search_batch_size=50,\n # train_size=len(dataset), # The number of embeddings to use for training\n # string_factory=\"IVF300_HNSW32,Flat\" # To use an index (optional, maybe required for big datasets)\n # )\n # Read more about the `string_factory` here:\n # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n embedding_dedup = EmbeddingDedup(\n threshold=0.8,\n input_batch_size=batch_size,\n )\n\n data >> embedding_dedup\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n "},{"location":"components-gallery/steps/apigenexecutionchecker/","title":"APIGenExecutionChecker","text":"Executes the generated function calls. This step checks if a given answer from a model as generated by APIGenGenerator can be executed against the given library (given by libpath , which is a string pointing to a python .py file with functions). "},{"location":"components-gallery/steps/apigenexecutionchecker/#attributes","title":"Attributes","text":" -
libpath: The path to the library from which the functions will be retrieved. It can also point to a folder with the functions. In this case, the folder should contain .py files, each with a single function whose name matches the filename. -
check_is_dangerous: Bool to exclude some potentially dangerous functions; it contains some heuristics found while testing. These functions can run subprocesses, interact with the OS, or perform other potentially dangerous operations. Defaults to True. "},{"location":"components-gallery/steps/apigenexecutionchecker/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[answers]\n end\n subgraph New columns\n OCOL0[keep_row_after_execution_check]\n OCOL1[execution_result]\n end\n end\n\n subgraph APIGenExecutionChecker\n StepInput[Input Columns: answers]\n StepOutput[Output Columns: keep_row_after_execution_check, execution_result]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/apigenexecutionchecker/#inputs","title":"Inputs","text":" - answers (
str ): List with arguments to be passed to the function, dumped as a string from a list of dictionaries. Should be loaded using json.loads . "},{"location":"components-gallery/steps/apigenexecutionchecker/#outputs","title":"Outputs","text":""},{"location":"components-gallery/steps/apigenexecutionchecker/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/apigenexecutionchecker/#execute-a-function-from-a-given-library-with-the-answer-from-an-llm","title":"Execute a function from a given library with the answer from an LLM","text":"from distilabel.steps.tasks import APIGenExecutionChecker\n\n# For the libpath you can use as an example the file at the tests folder:\n# ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\ntask = APIGenExecutionChecker(\n libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n)\ntask.load()\n\nres = next(\n task.process(\n [\n {\n \"answers\": [\n {\n \"arguments\": {\n \"initial_velocity\": 0.2,\n \"acceleration\": 0.1,\n \"time\": 0.5,\n },\n \"name\": \"final_velocity\",\n }\n ],\n }\n ]\n )\n)\nres\n#[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n "},{"location":"components-gallery/steps/apigenexecutionchecker/#references","title":"References","text":""},{"location":"components-gallery/steps/minhashdedup/","title":"MinHashDedup","text":"Deduplicates text using MinHash and MinHashLSH . MinHashDedup is a Step that detects near-duplicates in datasets. The idea roughly translates to the following steps: 1. Tokenize the text into words or ngrams. 2. Create a MinHash for each text. 3. Store the MinHashes in a MinHashLSH . 4. Check if the MinHash is already in the LSH , if so, it is a duplicate. "},{"location":"components-gallery/steps/minhashdedup/#attributes","title":"Attributes","text":" -
num_perm: the number of permutations to use. Defaults to 128 . -
seed: the seed to use for the MinHash. Defaults to 1 . -
tokenizer: the tokenizer to use. Available ones are words or ngrams . If words is selected, it tokenizes the text into words using nltk's word tokenizer. If ngrams is selected, it tokenizes the text into ngrams (of the size given by n ). Defaults to words . -
n: the size of the ngrams to use. Only relevant if tokenizer=\"ngrams\" . Defaults to 5 . -
threshold: the threshold to consider two MinHashes as duplicates. Values closer to 0 detect more duplicates. Defaults to 0.9 . -
storage: the storage to use for the LSH. Can be dict to store the index in memory, or disk . Keep in mind that disk is an experimental feature not defined in datasketch ; it is based on DiskCache's Index class. It should behave like a dict backed by disk, but depending on the system it can be slower. Defaults to dict . "},{"location":"components-gallery/steps/minhashdedup/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[text]\n end\n subgraph New columns\n OCOL0[keep_row_after_minhash_filtering]\n end\n end\n\n subgraph MinHashDedup\n StepInput[Input Columns: text]\n StepOutput[Output Columns: keep_row_after_minhash_filtering]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/minhashdedup/#inputs","title":"Inputs","text":" - text (
str ): the texts to be filtered. "},{"location":"components-gallery/steps/minhashdedup/#outputs","title":"Outputs","text":" - keep_row_after_minhash_filtering (
bool ): boolean indicating if the piece text is not a duplicate i.e. this text should be kept. "},{"location":"components-gallery/steps/minhashdedup/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/minhashdedup/#deduplicate-a-list-of-texts-using-minhash-and-minhashlsh","title":"Deduplicate a list of texts using MinHash and MinHashLSH","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps import MinHashDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n ds_size = 1000\n batch_size = 500 # Bigger batch sizes work better for this step\n data = LoadDataFromDicts(\n data=[\n {\"text\": \"This is a test document.\"},\n {\"text\": \"This document is a test.\"},\n {\"text\": \"Test document for duplication.\"},\n {\"text\": \"Document for duplication test.\"},\n {\"text\": \"This is another unique document.\"},\n ]\n * (ds_size // 5),\n batch_size=batch_size,\n )\n minhash_dedup = MinHashDedup(\n tokenizer=\"words\",\n threshold=0.9, # lower values will increase the number of duplicates\n storage=\"dict\", # or \"disk\" for bigger datasets\n )\n\n data >> minhash_dedup\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(use_cache=False)\n ds = distiset[\"default\"][\"train\"]\n # Filter out the duplicates\n ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n "},{"location":"components-gallery/steps/minhashdedup/#references","title":"References","text":""},{"location":"components-gallery/steps/combineoutputs/","title":"CombineOutputs","text":"Combine the outputs of several upstream steps. CombineOutputs is a Step that takes the outputs of several upstream steps and combines them to generate a new dictionary with all keys/columns of the upstream steps outputs. "},{"location":"components-gallery/steps/combineoutputs/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph CombineOutputs\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/combineoutputs/#inputs","title":"Inputs","text":" - dynamic (based on the upstream
Step s): All the columns of the upstream steps outputs. "},{"location":"components-gallery/steps/combineoutputs/#outputs","title":"Outputs","text":" - dynamic (based on the upstream
Step s): All the columns of the upstream steps outputs. "},{"location":"components-gallery/steps/combineoutputs/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/combineoutputs/#combine-dictionaries-of-a-dataset","title":"Combine dictionaries of a dataset","text":"from distilabel.steps import CombineOutputs\n\ncombine_outputs = CombineOutputs()\ncombine_outputs.load()\n\nresult = next(\n combine_outputs.process(\n [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n )\n)\n# [\n# {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n# {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n# ]\n "},{"location":"components-gallery/steps/combineoutputs/#combine-upstream-steps-outputs-in-a-pipeline","title":"Combine upstream steps outputs in a pipeline","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs\n\nwith Pipeline() as pipeline:\n step_1 = ...\n step_2 = ...\n step_3 = ...\n combine = CombineOutputs()\n\n [step_1, step_2, step_3] >> combine\n "},{"location":"components-gallery/steps/expandcolumns/","title":"ExpandColumns","text":"Expand columns that contain lists into multiple rows. ExpandColumns is a Step that takes a list of columns and expands them into multiple rows. The new rows will have the same data as the original row, except for the expanded column, which will contain a single item from the original list. "},{"location":"components-gallery/steps/expandcolumns/#attributes","title":"Attributes","text":" -
columns: A dictionary that maps the column to be expanded to the new column name, or a list of columns to be expanded. If a list is provided, the new column name will be the same as the column name. -
encoded: A bool to inform whether the columns are JSON-encoded lists. If this value is set to True, the columns will be decoded before expanding. Alternatively, to specify columns that can be encoded, a list can be provided. In this case, the column names provided must be a subset of the columns selected for expansion. -
split_statistics: A bool to inform whether the statistics in the distilabel_metadata column should be split into multiple rows. If we want to expand some columns containing a list of strings that come from parsing the output of an LLM, the tokens in the statistics_{step_name} of the distilabel_metadata column should be split to avoid multiplying them if we aggregate the data afterwards. For example, with a task that is supposed to generate a list of N instructions, and we want each of those N instructions in a different row, we should split the statistics by N. In such a case, set this value to True. "},{"location":"components-gallery/steps/expandcolumns/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph ExpandColumns\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/expandcolumns/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): The columns to be expanded into multiple rows. "},{"location":"components-gallery/steps/expandcolumns/#outputs","title":"Outputs","text":" - dynamic (determined by
columns attribute): The expanded columns. "},{"location":"components-gallery/steps/expandcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/expandcolumns/#expand-the-selected-columns-into-multiple-rows","title":"Expand the selected columns into multiple rows","text":"from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"]}\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n "},{"location":"components-gallery/steps/expandcolumns/#expand-the-selected-columns-which-are-json-encoded-into-multiple-rows","title":"Expand the selected columns which are JSON encoded into multiple rows","text":"from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n encoded=True, # It can also be a list of columns that are encoded, i.e. [\"generation\"]\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": '[\"generation 1\", \"generation 2\"]'}\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n "},{"location":"components-gallery/steps/expandcolumns/#expand-the-selected-columns-and-split-the-statistics-in-the-distilabel_metadata-column","title":"Expand the selected columns and split the statistics in the distilabel_metadata column","text":"from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n columns=[\"generation\"],\n split_statistics=True,\n)\nexpand_columns.load()\n\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"instruction 1\",\n \"generation\": [\"generation 1\", \"generation 2\"],\n \"distilabel_metadata\": {\n \"statistics_generation\": {\n \"input_tokens\": [12],\n \"output_tokens\": [12],\n },\n },\n }\n ],\n )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}, {'instruction': 'instruction 1', 'generation': 'generation 2', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}]\n "},{"location":"components-gallery/steps/groupcolumns/","title":"GroupColumns","text":"Combines columns from a list of StepInput . GroupColumns is a Step that implements the process method that calls the group_dicts function to handle and combine a list of StepInput . Also GroupColumns provides two attributes columns and output_columns to specify the columns to group and the output columns which will override the default value for the properties inputs and outputs , respectively. 
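One common reason to rename the grouped columns is to match what downstream steps expect; for instance, FormatTextGenerationDPO (documented above) reads generations and generation_models. A rough sketch of that hand-off, reusing only process calls shown elsewhere in this gallery (the concrete values are illustrative):

from distilabel.steps import FormatTextGenerationDPO, GroupColumns

group_columns = GroupColumns(
    name="group_columns",
    columns=["generation", "model_name"],
    output_columns=["generations", "generation_models"],
)
group_columns.load()

grouped = next(
    group_columns.process(
        [{"generation": "4", "model_name": "model-a"}],
        [{"generation": "6", "model_name": "model-b"}],
    )
)
# Expected: [{'generations': ['4', '6'], 'generation_models': ['model-a', 'model-b']}]

format_dpo = FormatTextGenerationDPO()
format_dpo.load()
row = {"instruction": "What's 2+2?", "ratings": [1, -1], **grouped[0]}
result = next(format_dpo.process([row]))
# `result` should now contain the prompt/chosen/rejected columns described earlier.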
"},{"location":"components-gallery/steps/groupcolumns/#attributes","title":"Attributes","text":""},{"location":"components-gallery/steps/groupcolumns/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph GroupColumns\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/groupcolumns/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): The columns to group. "},{"location":"components-gallery/steps/groupcolumns/#outputs","title":"Outputs","text":" - dynamic (determined by
columns and output_columns attributes): The columns that were grouped. "},{"location":"components-gallery/steps/groupcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/groupcolumns/#group-columns-of-a-dataset","title":"Group columns of a dataset","text":"from distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n)\ngroup_columns.load()\n\nresult = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n)\n# >>> result\n# [{'merged_generation': ['AI generated text', 'Other generated text'], 'merged_model_name': ['my_model']}]\n "},{"location":"components-gallery/steps/groupcolumns/#specify-the-name-of-the-output-columns","title":"Specify the name of the output columns","text":"from distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n name=\"group_columns\",\n columns=[\"generation\", \"model_name\"],\n output_columns=[\"generations\", \"generation_models\"]\n)\ngroup_columns.load()\n\nresult = next(\n group_columns.process(\n [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n )\n)\n# >>> result\n#[{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n "},{"location":"components-gallery/steps/keepcolumns/","title":"KeepColumns","text":"Keeps selected columns in the dataset. KeepColumns is a Step that implements the process method that keeps only the columns specified in the columns attribute. Also KeepColumns provides an attribute columns to specify the columns to keep which will override the default value for the properties inputs and outputs . "},{"location":"components-gallery/steps/keepcolumns/#note","title":"Note","text":"The order in which the columns are provided is important, as the output will be sorted using the provided order, which is useful before pushing either a dataset.Dataset via the PushToHub step or a distilabel.Distiset via the Pipeline.run output variable. "},{"location":"components-gallery/steps/keepcolumns/#attributes","title":"Attributes","text":" - columns: List of strings with the names of the columns to keep.
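Because the note above says the output is sorted using the order in which the columns are provided, reordering the columns attribute is enough to control the column layout before pushing a dataset. A small illustrative sketch (the row values mirror the example below):

from distilabel.steps import KeepColumns

# Listing "generation" first means it should also come first in the output rows.
keep_columns = KeepColumns(columns=["generation", "instruction"])
keep_columns.load()

result = next(
    keep_columns.process(
        [{"instruction": "What's the brightest color?", "generation": "white", "model_name": "my_model"}],
    )
)
# Expected: rows containing only "generation" and "instruction", in that order.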
"},{"location":"components-gallery/steps/keepcolumns/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph KeepColumns\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/keepcolumns/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): The columns to keep. "},{"location":"components-gallery/steps/keepcolumns/#outputs","title":"Outputs","text":" - dynamic (determined by
columns attribute): The columns that were kept. "},{"location":"components-gallery/steps/keepcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/keepcolumns/#select-the-columns-to-keep","title":"Select the columns to keep","text":"from distilabel.steps import KeepColumns\n\nkeep_columns = KeepColumns(\n columns=[\"instruction\", \"generation\"],\n)\nkeep_columns.load()\n\nresult = next(\n keep_columns.process(\n [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n )\n)\n# >>> result\n# [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n "},{"location":"components-gallery/steps/mergecolumns/","title":"MergeColumns","text":"Merge columns from a row. MergeColumns is a Step that implements the process method that calls the merge_columns function to handle and combine columns in a StepInput . MergeColumns provides two attributes columns and output_column to specify the columns to merge and the resulting output column. This step can be useful if you have a `Task` that generates instructions for example, and you\nwant to have more examples of those. In such a case, you could for example use another `Task`\nto multiply your instructions synthetically, what would yield two different columns splitted.\nUsing `MergeColumns` you can merge them and use them as a single column in your dataset for\nfurther processing.\n "},{"location":"components-gallery/steps/mergecolumns/#attributes","title":"Attributes","text":""},{"location":"components-gallery/steps/mergecolumns/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph MergeColumns\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/mergecolumns/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): The columns to merge. "},{"location":"components-gallery/steps/mergecolumns/#outputs","title":"Outputs","text":" - dynamic (determined by
columns and output_column attributes): The columns that were merged. "},{"location":"components-gallery/steps/mergecolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/mergecolumns/#combine-columns-in-rows-of-a-dataset","title":"Combine columns in rows of a dataset","text":"from distilabel.steps import MergeColumns\n\ncombiner = MergeColumns(\n columns=[\"queries\", \"multiple_queries\"],\n output_column=\"queries\",\n)\ncombiner.load()\n\nresult = next(\n combiner.process(\n [\n {\n \"queries\": \"How are you?\",\n \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n }\n ],\n )\n)\n# >>> result\n# [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n "},{"location":"components-gallery/steps/dbscan/","title":"DBSCAN","text":"DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples in regions of high density and expands clusters from them. This algorithm is good for data which contains clusters of similar density. This is a `GlobalStep` that clusters the embeddings using the DBSCAN algorithm\nfrom `sklearn`. Visit `TextClustering` step for an example of use.\nThe trained model is saved as an artifact when creating a distiset\nand pushing it to the Hugging Face Hub.\n "},{"location":"components-gallery/steps/dbscan/#attributes","title":"Attributes","text":" - eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. - min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If
min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse. - metric: The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter. - n_jobs: The number of parallel jobs to run. "},{"location":"components-gallery/steps/dbscan/#runtime-parameters","title":"Runtime Parameters","text":" -
eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. -
min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse. -
metric: The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter. -
n_jobs: The number of parallel jobs to run. "},{"location":"components-gallery/steps/dbscan/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[projection]\n end\n subgraph New columns\n OCOL0[cluster_label]\n end\n end\n\n subgraph DBSCAN\n StepInput[Input Columns: projection]\n StepOutput[Output Columns: cluster_label]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/dbscan/#inputs","title":"Inputs","text":" - projection (
List[float] ): Vector representation of the text to cluster, normally the output from the UMAP step. "},{"location":"components-gallery/steps/dbscan/#outputs","title":"Outputs","text":" - cluster_label (
int ): Integer representing the label of a given cluster. -1 means it wasn't clustered. "},{"location":"components-gallery/steps/dbscan/#references","title":"References","text":" -
DBSCAN demo of sklearn -
sklearn dbscan "},{"location":"components-gallery/steps/umap/","title":"UMAP","text":"UMAP is a general purpose manifold learning and dimension reduction algorithm. This is a GlobalStep that reduces the dimensionality of the embeddings using the UMAP algorithm. Visit the TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub. "},{"location":"components-gallery/steps/umap/#attributes","title":"Attributes","text":" - n_components: The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100. - metric: The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to
euclidean . - n_jobs: The number of parallel jobs to run. Defaults to 8 . - random_state: The random state to use for the UMAP algorithm. "},{"location":"components-gallery/steps/umap/#runtime-parameters","title":"Runtime Parameters","text":" -
n_components: The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100. -
metric: The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean . -
n_jobs: The number of parallel jobs to run. Defaults to 8 . -
random_state: The random state to use for the UMAP algorithm. "},{"location":"components-gallery/steps/umap/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[embedding]\n end\n subgraph New columns\n OCOL0[projection]\n end\n end\n\n subgraph UMAP\n StepInput[Input Columns: embedding]\n StepOutput[Output Columns: projection]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/umap/#inputs","title":"Inputs","text":" - embedding (
List[float] ): The original embeddings we want to reduce the dimension. "},{"location":"components-gallery/steps/umap/#outputs","title":"Outputs","text":" - projection (
List[float] ): Embedding reduced to the number of components specified; the size of the new embeddings will be determined by n_components . "},{"location":"components-gallery/steps/umap/#references","title":"References","text":" -
UMAP repository -
UMAP documentation "},{"location":"components-gallery/steps/faissnearestneighbour/","title":"FaissNearestNeighbour","text":"Create a faiss index to get the nearest neighbours. FaissNearestNeighbour is a GlobalStep that creates a faiss index using the Hugging Face datasets library integration, and then gets the nearest neighbours and the scores or distance of the nearest neighbours for each input row. "},{"location":"components-gallery/steps/faissnearestneighbour/#attributes","title":"Attributes","text":" -
device: the CUDA device ID or a list of IDs to be used. If negative integer, it will use all the available GPUs. Defaults to None . -
string_factory: the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None . -
metric_type: the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is to import faiss and then pass one of the faiss.METRIC_x variables. Defaults to None . -
k: the number of nearest neighbours to search for each input row. Defaults to 1 . -
search_batch_size: the number of rows to include in a search batch. The value can be adjusted to maximize resource usage or to avoid OOM issues. Defaults to 50 . -
train_size: If the index needs a training step, specifies how many vectors will be used to train the index. "},{"location":"components-gallery/steps/faissnearestneighbour/#runtime-parameters","title":"Runtime Parameters","text":" -
device: the CUDA device ID or a list of IDs to be used. If negative integer, it will use all the available GPUs. Defaults to None . -
string_factory: the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None . -
metric_type: the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is to import faiss and then pass one of the faiss.METRIC_x variables. Defaults to None . -
k: the number of nearest neighbours to search for each input row. Defaults to 1 . -
search_batch_size: the number of rows to include in a search batch. The value can be adjusted to maximize resource usage or to avoid OOM issues. Defaults to 50 . -
train_size: If the index needs a training step, specifies how many vectors will be used to train the index. "},{"location":"components-gallery/steps/faissnearestneighbour/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[embedding]\n end\n subgraph New columns\n OCOL0[nn_indices]\n OCOL1[nn_scores]\n end\n end\n\n subgraph FaissNearestNeighbour\n StepInput[Input Columns: embedding]\n StepOutput[Output Columns: nn_indices, nn_scores]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/faissnearestneighbour/#inputs","title":"Inputs","text":" - embedding (
List[Union[float, int]] ): a sentence embedding. "},{"location":"components-gallery/steps/faissnearestneighbour/#outputs","title":"Outputs","text":" -
nn_indices (List[int] ): a list containing the indices of the k nearest neighbours in the inputs for the row. -
nn_scores (List[float] ): a list containing the score or distance to each k nearest neighbour in the inputs. "},{"location":"components-gallery/steps/faissnearestneighbour/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/faissnearestneighbour/#generating-embeddings-and-getting-the-nearest-neighbours","title":"Generating embeddings and getting the nearest neighbours","text":"from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\nwith Pipeline(name=\"hello\") as pipeline:\n load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n embeddings = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\"\n )\n )\n\n nearest_neighbours = FaissNearestNeighbour()\n\n load_data >> embeddings >> nearest_neighbours\n\nif __name__ == \"__main__\":\n distiset = pipeline.run(\n parameters={\n load_data.name: {\n \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n \"split\": \"test\",\n },\n },\n use_cache=False,\n )\n "},{"location":"components-gallery/steps/faissnearestneighbour/#references","title":"References","text":""},{"location":"components-gallery/steps/embeddinggeneration/","title":"EmbeddingGeneration","text":"Generate embeddings using an Embeddings model. EmbeddingGeneration is a Step that using an Embeddings model generates sentence embeddings for the provided input texts. "},{"location":"components-gallery/steps/embeddinggeneration/#attributes","title":"Attributes","text":" - embeddings: the
Embeddings model used to generate the sentence embeddings. "},{"location":"components-gallery/steps/embeddinggeneration/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[text]\n end\n subgraph New columns\n OCOL0[embedding]\n end\n end\n\n subgraph EmbeddingGeneration\n StepInput[Input Columns: text]\n StepOutput[Output Columns: embedding]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/embeddinggeneration/#inputs","title":"Inputs","text":" - text (
str ): The text for which the sentence embedding has to be generated. "},{"location":"components-gallery/steps/embeddinggeneration/#outputs","title":"Outputs","text":" - embedding (
List[Union[float, int]] ): the generated sentence embedding. "},{"location":"components-gallery/steps/embeddinggeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/embeddinggeneration/#generate-sentence-embeddings-with-sentence-transformers","title":"Generate sentence embeddings with Sentence Transformers","text":"from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.steps import EmbeddingGeneration\n\nembedding_generation = EmbeddingGeneration(\n embeddings=SentenceTransformerEmbeddings(\n model=\"mixedbread-ai/mxbai-embed-large-v1\",\n )\n)\n\nembedding_generation.load()\n\nresult = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n# [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n "},{"location":"components-gallery/steps/rewardmodelscore/","title":"RewardModelScore","text":"Assign a score to a response using a Reward Model. RewardModelScore is a Step that using a Reward Model (RM) loaded using transformers , assigns an score to a response generated for an instruction, or a score to a multi-turn conversation. "},{"location":"components-gallery/steps/rewardmodelscore/#attributes","title":"Attributes","text":" -
model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. -
revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . -
torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\" . -
trust_remote_code: whether to allow fetching and executing remote code from the repository in the Hub. Defaults to False . -
device_map: a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\" . Defaults to None . -
token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment variable or the huggingface_hub package local configuration will be used. Defaults to None . -
truncation: whether to truncate sequences at the maximum length. Defaults to False . -
max_length: maximun length to use for padding or truncation. Defaults to None . "},{"location":"components-gallery/steps/rewardmodelscore/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[response]\n ICOL2[conversation]\n end\n subgraph New columns\n OCOL0[score]\n end\n end\n\n subgraph RewardModelScore\n StepInput[Input Columns: instruction, response, conversation]\n StepOutput[Output Columns: score]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/rewardmodelscore/#inputs","title":"Inputs","text":" -
instruction (str , optional): the instruction used to generate a response . If provided, then response must be provided too. -
response (str , optional): the response generated for instruction . If provided, then instruction must be provided too. -
conversation (ChatType , optional): a multi-turn conversation. If not provided, then instruction and response columns must be provided. "},{"location":"components-gallery/steps/rewardmodelscore/#outputs","title":"Outputs","text":" - score (
float ): the score given by the reward model for the instruction-response pair or the conversation. "},{"location":"components-gallery/steps/rewardmodelscore/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/rewardmodelscore/#response-pair","title":"response pair","text":"from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n step.process(\n inputs=[\n {\n \"instruction\": \"How much is 2+2?\",\n \"response\": \"The output of 2+2 is 4\",\n },\n {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n ]\n )\n)\n# [\n# {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n# {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n# ]\n "},{"location":"components-gallery/steps/rewardmodelscore/#turn-conversation","title":"turn conversation","text":"from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n step.process(\n inputs=[\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n ],\n },\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n {\"role\": \"assistant\", \"content\": \"4\"},\n ],\n },\n ]\n )\n)\n# [\n# {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n# {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n# ]\n "},{"location":"components-gallery/steps/formatprm/","title":"FormatPRM","text":"Helper step to transform the data into the format expected by the PRM model. This step can be used to format the data in one of 2 formats: Following the format presented in peiyi9979/Math-Shepherd, in which case this step creates the columns input and label, where the input is the instruction with the solution (and the tag replaced by a token), and the label is the instruction with the solution, both separated by a newline. Following TRL's format for training, which generates the columns prompt, completions, and labels. The labels correspond to the original tags replaced by boolean values, where True represents correct steps. "},{"location":"components-gallery/steps/formatprm/#attributes","title":"Attributes","text":" -
format: The format to use for the PRM model. \"math-shepherd\" corresponds to the original paper, while \"trl\" is a format prepared to train the model using TRL. -
step_token: String that serves as a unique token denoting the position for predicting the step score. -
tags: List of tags that represent the correct and incorrect steps. This only needs to be informed if it's different than the default in MathShepherdCompleter . "},{"location":"components-gallery/steps/formatprm/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[solutions]\n end\n subgraph New columns\n OCOL0[input]\n OCOL1[label]\n OCOL2[prompt]\n OCOL3[completions]\n OCOL4[labels]\n end\n end\n\n subgraph FormatPRM\n StepInput[Input Columns: instruction, solutions]\n StepOutput[Output Columns: input, label, prompt, completions, labels]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/formatprm/#inputs","title":"Inputs","text":""},{"location":"components-gallery/steps/formatprm/#outputs","title":"Outputs","text":" -
input (str ): The instruction with the solutions, where the label tags are replaced by a token. -
label (str ): The instruction with the solutions. -
prompt (str ): The instruction with the solutions, where the label tags are replaced by a token. -
completions (List[str] ): The solution represented as a list of steps. -
labels (List[bool] ): The labels, as a list of booleans, where True represents a good response. "},{"location":"components-gallery/steps/formatprm/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formatprm/#shepherd-format","title":"Shepherd format","text":"from distilabel.steps.tasks import FormatPRM\nfrom distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(columns=[\"solutions\"])\nexpand_columns.load()\n\n# Define our PRM formatter\nformatter = FormatPRM()\nformatter.load()\n\n# Expand the solutions column as it comes from the MathShepherdCompleter\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. 
The answer is: 3 +\"]]\n },\n ]\n )\n)\nresult = next(formatter.process(result))\n "},{"location":"components-gallery/steps/formatprm/#prepare-your-data-to-train-a-prm-model-with-the-trl-format","title":"Prepare your data to train a PRM model with the TRL format","text":"from distilabel.steps.tasks import FormatPRM\nfrom distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(columns=[\"solutions\"])\nexpand_columns.load()\n\n# Define our PRM formatter\nformatter = FormatPRM(format=\"trl\")\nformatter.load()\n\n# Expand the solutions column as it comes from the MathShepherdCompleter\nresult = next(\n expand_columns.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"solutions\": [[\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can multiply 2 by 0.5 (which is the same as dividing by 2): 2 * 0.5 = <<2*0.5=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"], [\"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. +\", \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\", \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"]]\n },\n ]\n )\n)\n\nresult = next(formatter.process(result))\n# {\n# \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# \"solutions\": [\n# \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required. 
+\",\n# \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber. +\",\n# \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3 +\"\n# ],\n# \"prompt\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# \"completions\": [\n# \"Step 1: Determine the amount of blue fiber needed: 2 bolts of blue fiber are required.\",\n# \"Step 2: Calculate the amount of white fiber needed: Since it's half that much, we can divide 2 by 2: 2 / 2 = <<2/2=1>>1 bolt of white fiber.\",\n# \"Step 3: Add the amount of blue and white fiber: 2 (blue) + 1 (white) = <<2+1=3>>3 bolts of fiber in total. The answer is: 3\"\n# ],\n# \"labels\": [\n# true,\n# true,\n# true\n# ]\n# }\n "},{"location":"components-gallery/steps/formatprm/#references","title":"References","text":""},{"location":"components-gallery/steps/truncatetextcolumn/","title":"TruncateTextColumn","text":"Truncate a row using a tokenizer or the number of characters. TruncateTextColumn is a Step that truncates a row according to the max length. If the tokenizer is provided, then the row will be truncated using the tokenizer, and the max_length will be used as the maximum number of tokens, otherwise it will be used as the maximum number of characters. The TruncateTextColumn step is useful when one wants to truncate a row to a certain length, to avoid posterior errors in the model due to the length. "},{"location":"components-gallery/steps/truncatetextcolumn/#attributes","title":"Attributes","text":" -
column: the column to truncate. Defaults to \"text\" . -
max_length: the maximum length to use for truncation. If a tokenizer is given, corresponds to the number of tokens, otherwise corresponds to the number of characters. Defaults to 8192 . -
tokenizer: the name of the tokenizer to use. If provided, the row will be truncated using the tokenizer. Defaults to None . "},{"location":"components-gallery/steps/truncatetextcolumn/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[dynamic]\n end\n end\n\n subgraph TruncateTextColumn\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: dynamic]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/steps/truncatetextcolumn/#inputs","title":"Inputs","text":" - dynamic (determined by
column attribute): The columns to be truncated, defaults to \"text\". "},{"location":"components-gallery/steps/truncatetextcolumn/#outputs","title":"Outputs","text":" - dynamic (determined by
column attribute): The truncated column. "},{"location":"components-gallery/steps/truncatetextcolumn/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/truncatetextcolumn/#truncating-a-row-to-a-given-number-of-tokens","title":"Truncating a row to a given number of tokens","text":"from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n max_length=4,\n column=\"text\"\n)\n\ntrunc.load()\n\nresult = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n)\n# result\n# [{'text': 'This is a sample'}]\n "},{"location":"components-gallery/steps/truncatetextcolumn/#truncating-a-row-to-a-given-number-of-characters","title":"Truncating a row to a given number of characters","text":"from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(max_length=10)\n\ntrunc.load()\n\nresult = next(\n trunc.process(\n [\n {\"text\": \"This is a sample text that is longer than 10 characters\"}\n ]\n )\n)\n# result\n# [{'text': 'This is a '}]\n "},{"location":"components-gallery/tasks/","title":"Tasks Gallery","text":"Category Overview The gallery page showcases the different types of components within distilabel . Icon Category Description text-generation Text generation steps are used to generate text based on a given prompt. chat-generation Chat generation steps are used to generate text based on a conversation. text-classification Text classification steps are used to classify text into a category. text-manipulation Text manipulation steps are used to manipulate or rewrite an input text. evol Evol steps are used to rewrite input text and evolve it to a higher quality. critique Critique steps are used to provide feedback on the quality of the data with a written explanation. scorer Scorer steps are used to evaluate and score the data with a numerical value. preference Preference steps are used to collect preferences on the data with numerical values or ranks. embedding Embedding steps are used to generate embeddings for the data. clustering Clustering steps are used to group similar data points together. columns Columns steps are used to manipulate columns in the data. filtering Filtering steps are used to filter the data based on some criteria. format Format steps are used to format the data. load Load steps are used to load the data. execution Executes python functions. save Save steps are used to save the data. labelling Labelling steps are used to label the data. -
APIGenGenerator Generate queries and answers for the given functions in JSON format. APIGenGenerator -
Genstruct Generate a pair of instruction-response from a document using an LLM . Genstruct -
Magpie Generates conversations using an instruct fine-tuned LLM. Magpie -
MathShepherdCompleter Math Shepherd Completer and auto-labeller task. MathShepherdCompleter -
MathShepherdGenerator Math Shepherd solution generator. MathShepherdGenerator -
SelfInstruct Generate instructions based on a given input using an LLM . SelfInstruct -
TextGeneration Text generation with an LLM given a prompt. TextGeneration -
TextGenerationWithImage Text generation with images with an LLM given a prompt. TextGenerationWithImage -
URIAL Generates a response using a non-instruct fine-tuned model. URIAL -
MagpieGenerator Generator task that generates instructions or conversations using Magpie. MagpieGenerator -
ChatGeneration Generates text based on a conversation. ChatGeneration -
ArgillaLabeller Annotate Argilla records based on input fields, example records and question settings. ArgillaLabeller -
TextClassification Classifies text into one or more categories or labels. TextClassification -
EvolInstruct Evolve instructions using an LLM . EvolInstruct -
EvolComplexity Evolve instructions to make them more complex using an LLM . EvolComplexity -
EvolQuality Evolve the quality of the responses using an LLM . EvolQuality -
EvolInstructGenerator Generate evolved instructions using an LLM . EvolInstructGenerator -
EvolComplexityGenerator Generate evolved instructions with increased complexity using an LLM . EvolComplexityGenerator -
InstructionBacktranslation Self-Alignment with Instruction Backtranslation. InstructionBacktranslation -
PrometheusEval Critique and rank the quality of generations from an LLM using Prometheus 2.0. PrometheusEval -
ComplexityScorer Score instructions based on their complexity using an LLM . ComplexityScorer -
QualityScorer Score responses based on their quality using an LLM . QualityScorer -
CLAIR Contrastive Learning from AI Revisions (CLAIR). CLAIR -
UltraFeedback Rank generations focusing on different aspects using an LLM . UltraFeedback -
PairRM Rank the candidates based on the input using the LLM model. PairRM -
GenerateSentencePair Generate a positive and (optionally) a negative sentence given an anchor sentence. GenerateSentencePair -
GenerateEmbeddings Generate embeddings using the last hidden state of an LLM . GenerateEmbeddings -
TextClustering Task that clusters a set of texts and generates summary labels for each cluster. TextClustering -
TextClustering Task that clusters a set of texts and generates summary labels for each cluster. TextClustering -
APIGenSemanticChecker Generate queries and answers for the given functions in JSON format. APIGenSemanticChecker -
GenerateTextRetrievalData Generate text retrieval data with an LLM to later on train an embedding model. GenerateTextRetrievalData -
GenerateShortTextMatchingData Generate short text matching data with an LLM to later on train an embedding model. GenerateShortTextMatchingData -
GenerateLongTextMatchingData Generate long text matching data with an LLM to later on train an embedding model. GenerateLongTextMatchingData -
GenerateTextClassificationData Generate text classification data with an LLM to later on train an embedding model. GenerateTextClassificationData -
StructuredGeneration Generate structured content for a given instruction using an LLM . StructuredGeneration -
MonolingualTripletGenerator Generate monolingual triplets with an LLM to later on train an embedding model. MonolingualTripletGenerator -
BitextRetrievalGenerator Generate bitext retrieval data with an LLM to later on train an embedding model. BitextRetrievalGenerator -
EmbeddingTaskGenerator Generate task descriptions for embedding-related tasks using an LLM . EmbeddingTaskGenerator "},{"location":"components-gallery/tasks/apigengenerator/","title":"APIGenGenerator","text":"Generate queries and answers for the given functions in JSON format. The APIGenGenerator is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task generates a set of diverse queries and corresponding answers for the given functions in JSON format. "},{"location":"components-gallery/tasks/apigengenerator/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to guide the user in the generation of queries and answers. -
use_tools: Whether to use the tools available in the prompt to generate the queries and answers. In case the tools are given in the input, they will be added to the prompt. -
number: The number of queries to generate. It can be a list, where each number will be chosen randomly, or a dictionary with the number of queries and the probability of each. E.g. number=1 , number=[1, 2, 3] , number={1: 0.5, 2: 0.3, 3: 0.2} are all valid inputs. It corresponds to the number of parallel queries to generate. -
use_default_structured_output: Whether to use the default structured output or not. "},{"location":"components-gallery/tasks/apigengenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[examples]\n ICOL1[func_name]\n ICOL2[func_desc]\n ICOL3[tools]\n end\n subgraph New columns\n OCOL0[query]\n OCOL1[answers]\n end\n end\n\n subgraph APIGenGenerator\n StepInput[Input Columns: examples, func_name, func_desc, tools]\n StepOutput[Output Columns: query, answers]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/apigengenerator/#inputs","title":"Inputs","text":" -
examples (str ): Examples used as few shots to guide the model. -
func_name (str ): Name for the function to generate. -
func_desc (str ): Description of what the function should do. -
tools (str ): JSON formatted string containing the tool representation of the function. "},{"location":"components-gallery/tasks/apigengenerator/#outputs","title":"Outputs","text":" -
query (str ): The list of queries. -
answers (str ): JSON formatted string with the list of answers, containing the info as a dictionary to be passed to the functions. "},{"location":"components-gallery/tasks/apigengenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/apigengenerator/#generate-without-structured-output-original-implementation","title":"Generate without structured output (original implementation)","text":"from distilabel.steps.tasks import ApiGenGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\napigen = ApiGenGenerator(\n use_default_structured_output=False,\n llm=llm\n)\napigen.load()\n\nres = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n \"func_name\": \"getrandommovie\",\n \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n }\n ]\n )\n)\nres\n# [{'examples': 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n# 'number': 1,\n# 'func_name': 'getrandommovie',\n# 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n# 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?',\n# 'Give me 5 random movie suggestions from your database to plan my weekend.'],\n# 'answers': [[{'name': 'getrandommovie', 'arguments': {}}],\n# [{'name': 'getrandommovie', 'arguments': {}},\n# {'name': 'getrandommovie', 'arguments': {}},\n# {'name': 'getrandommovie', 'arguments': {}},\n# {'name': 'getrandommovie', 'arguments': {}},\n# {'name': 'getrandommovie', 'arguments': {}}]],\n# 'raw_input_api_gen_generator_0': [{'role': 'system',\n# 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\n\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\n\nEnsure the query:\n- Is clear and concise\n- Demonstrates typical use cases\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\n- The corresponding result's parameter types and ranges match with the function's descriptions\n\nEnsure the answer:\n- Is a list of function calls in JSON format\n- The length of the answer list should be equal to the number of requests in the query\n- Can solve all the requests in the query effectively\"},\n# {'role': 'user',\n# 'content': 'Here are examples of queries and the corresponding answers for similar functions:\nQUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\n\nNote that the query could be interpreted as a combination of several independent requests.\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\nThe detailed function description is the following:\nReturns a list of random movies from a database by calling an external API.\n\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\n "},{"location":"components-gallery/tasks/apigengenerator/#generate-with-structured-output","title":"Generate with structured output","text":"from distilabel.steps.tasks import ApiGenGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\napigen = ApiGenGenerator(\n use_default_structured_output=True,\n llm=llm\n)\napigen.load()\n\nres_struct = next(\n apigen.process(\n [\n {\n \"examples\": 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n \"func_name\": \"getrandommovie\",\n \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n }\n ]\n )\n)\nres_struct\n# [{'examples': 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n# 'number': 1,\n# 'func_name': 'getrandommovie',\n# 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n# 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\",\n# \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"],\n# 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}],\n# [{'arguments': {}, 'name': 'getrandommovie'}]],\n# 'raw_input_api_gen_generator_0': [{'role': 'system',\n# 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\n\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\n\nEnsure the query:\n- Is clear and concise\n- Demonstrates typical use cases\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\n- The corresponding result's parameter types and ranges match with the function's descriptions\n\nEnsure the answer:\n- Is a list of function calls in JSON format\n- The length of the answer list should be equal to the number of requests in the query\n- Can solve all the requests in the query effectively\"},\n# {'role': 'user',\n# 'content': 'Here are examples of queries and the corresponding answers for similar functions:\nQUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\n\nNote that the query could be interpreted as a combination of several independent requests.\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\nThe detailed function description is the following:\nReturns a list of random movies from a database by calling an external API.\n\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/apigengenerator/#references","title":"References","text":""},{"location":"components-gallery/tasks/genstruct/","title":"Genstruct","text":"Generate a pair of instruction-response from a document using an LLM . Genstruct is a pre-defined task designed to generate valid instructions from a given raw document, with the title and the content, enabling the creation of new, partially synthetic instruction finetuning datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is inspired in the Ada-Instruct paper. "},{"location":"components-gallery/tasks/genstruct/#note","title":"Note","text":"The Genstruct prompt i.e. the task, can be used with any model really, but the safest / recommended option is to use NousResearch/Genstruct-7B as the LLM provided to the task, since it was trained for this specific task. "},{"location":"components-gallery/tasks/genstruct/#attributes","title":"Attributes","text":" - _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/genstruct/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[title]\n ICOL1[content]\n end\n subgraph New columns\n OCOL0[user]\n OCOL1[assistant]\n OCOL2[model_name]\n end\n end\n\n subgraph Genstruct\n StepInput[Input Columns: title, content]\n StepOutput[Output Columns: user, assistant, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/genstruct/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/genstruct/#outputs","title":"Outputs","text":" -
user (str ): The user's instruction based on the document. -
assistant (str ): The assistant's response based on the user's instruction. -
model_name (str ): The model name used to generate the feedback and result . "},{"location":"components-gallery/tasks/genstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/genstruct/#generate-instructions-from-raw-documents-using-the-title-and-content","title":"Generate instructions from raw documents using the title and content","text":"from distilabel.steps.tasks import Genstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ngenstruct = Genstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"NousResearch/Genstruct-7B\",\n ),\n)\n\ngenstruct.load()\n\nresult = next(\n genstruct.process(\n [\n {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'title': 'An instruction',\n# 'content': 'content of the document',\n# 'model_name': 'test',\n# 'user': 'An instruction',\n# 'assistant': 'content of the document',\n# }\n# ]\n "},{"location":"components-gallery/tasks/genstruct/#references","title":"References","text":""},{"location":"components-gallery/tasks/magpie/","title":"Magpie","text":"Generates conversations using an instruct fine-tuned LLM. Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'. "},{"location":"components-gallery/tasks/magpie/#attributes","title":"Attributes","text":" -
n_turns: the number of turns that the generated conversation will have. Defaults to 1 . -
end_with_user: whether the conversation should end with a user message. Defaults to False . -
include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False . -
only_instruction: whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . -
system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None . "},{"location":"components-gallery/tasks/magpie/#runtime-parameters","title":"Runtime Parameters","text":" -
n_turns: the number of turns that the generated conversation will have. Defaults to 1 . -
end_with_user: whether the conversation should end with a user message. Defaults to False . -
include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False . -
only_instruction: whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . -
system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be choosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. "},{"location":"components-gallery/tasks/magpie/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[system_prompt]\n end\n subgraph New columns\n OCOL0[conversation]\n OCOL1[instruction]\n OCOL2[response]\n OCOL3[system_prompt_key]\n OCOL4[model_name]\n end\n end\n\n subgraph Magpie\n StepInput[Input Columns: system_prompt]\n StepOutput[Output Columns: conversation, instruction, response, system_prompt_key, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/magpie/#inputs","title":"Inputs","text":" - system_prompt (
str , optional): an optional system prompt that can be provided to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. "},{"location":"components-gallery/tasks/magpie/#outputs","title":"Outputs","text":" -
conversation (ChatType ): the generated conversation which is a list of chat items with a role and a message. Only if only_instruction=False . -
instruction (str ): the generated instructions if only_instruction=True or n_turns==1 . -
response (str ): the generated response if n_turns==1 . -
system_prompt_key (str , optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary. -
model_name (str ): The model name used to generate the conversation or instruction . "},{"location":"components-gallery/tasks/magpie/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/magpie/#generating-instructions-with-llama-3-8b-instruct-and-transformersllm","title":"Generating instructions with Llama 3 8B Instruct and TransformersLLM","text":"from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n)\n\nmagpie.load()\n\nresult = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n)\n# [\n# {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n# {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n# ]\n "},{"location":"components-gallery/tasks/magpie/#generating-conversations-with-llama-3-8b-instruct-and-transformersllm","title":"Generating conversations with Llama 3 8B Instruct and TransformersLLM","text":"from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n n_turns=2,\n)\n\nmagpie.load()\n\nresult = next(\n magpie.process(\n inputs=[\n {\n \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n },\n {\n \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n },\n ]\n )\n)\n# [\n# {\n# 'conversation': [\n# {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n# {\n# 'role': 'user',\n# 'content': 'I'm having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. It is read as \"the limit as x approaches a of f\n# of x\".'\n# },\n# {\n# 'role': 'assistant',\n# 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don't worry, I'm here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n# x\". What it's asking us to do is find the'\n# }\n# ]\n# },\n# {\n# 'conversation': [\n# {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n# {\n# 'role': 'user',\n# 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? 
Based on your expertise as a florist AI assistant, I think it\n# might be pests or diseases, but I'm not sure which.\"\n# },\n# {\n# 'role': 'assistant',\n# 'content': \"I'd be delighted to help you investigate the issue! Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. Here are a few potential culprits: 1.\n# **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n# }\n# ]\n# }\n# ]\n "},{"location":"components-gallery/tasks/magpie/#references","title":"References","text":" - Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
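A minimal sketch of the dictionary form of system_prompt mentioned in the attributes above, assuming it maps a key to a (system prompt, probability) tuple; the keys, prompts and exact accepted format are assumptions to be checked against the installed distilabel version: from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\"temperature\": 1.0, \"max_new_tokens\": 64},\n        device=\"mps\",\n    ),\n    # Assumed format: key -> (system prompt, probability of being chosen per batch).\n    system_prompt={\n        \"math\": (\"You're a math expert AI assistant for secondary school students.\", 0.8),\n        \"floristry\": (\"You're an expert florist AI assistant.\", 0.2),\n    },\n    only_instruction=True,\n)\n\nmagpie.load()\n\nresult = next(magpie.process(inputs=[{}, {}]))\n# Each row is expected to contain an 'instruction' and a 'system_prompt_key' column.\n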
"},{"location":"components-gallery/tasks/mathshepherdcompleter/","title":"MathShepherdCompleter","text":"Math Shepherd Completer and auto-labeller task. This task is in charge of, given a list of solutions to an instruction, and a golden solution, as reference, generate completions for the solutions, and label them according to the golden solution using the hard estimation method from figure 2 in the reference paper, Eq. 3. The attributes make the task flexible to be used with different types of dataset and LLMs, and allow making use of different fields to modify the system and user prompts for it. Before modifying them, review the current defaults to ensure the completions are generated correctly. "},{"location":"components-gallery/tasks/mathshepherdcompleter/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to be used in the completions. The default one has been checked and generates good completions using Llama 3.1 with 8B and 70B, but it can be modified to adapt it to the model and dataset selected. -
extra_rules: This field can be used to insert extra rules relevant to the type of dataset. For example, in the original paper they used GSM8K and MATH datasets, and this field can be used to insert the rules for the GSM8K dataset. -
few_shots: Few-shot examples to help the model generate the completions; write them in the format of the type of solutions wanted for your dataset. -
N: Number of completions to generate for each step; corresponds to N in the paper. They used 8 in the paper, but it can be adjusted. -
tags: List of tags to be used in the completions, the default ones are [\"+\", \"-\"] as in the paper, where the first is used as a positive label, and the second as a negative one. This can be updated, but it MUST be a list with 2 elements, where the first is the positive one, and the second the negative one. "},{"location":"components-gallery/tasks/mathshepherdcompleter/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[solutions]\n ICOL2[golden_solution]\n end\n subgraph New columns\n OCOL0[solutions]\n OCOL1[model_name]\n end\n end\n\n subgraph MathShepherdCompleter\n StepInput[Input Columns: instruction, solutions, golden_solution]\n StepOutput[Output Columns: solutions, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/mathshepherdcompleter/#inputs","title":"Inputs","text":" -
instruction (str ): The task or instruction. -
solutions (List[str] ): List of solutions to the task. -
golden_solution (str ): The reference solution to the task, will be used to annotate the candidate solutions. "},{"location":"components-gallery/tasks/mathshepherdcompleter/#outputs","title":"Outputs","text":" -
solutions (List[str] ): The same column that was used as input, with the \"solutions\" modified to include the labels. -
model_name (str ): The name of the model used to generate the revision. "},{"location":"components-gallery/tasks/mathshepherdcompleter/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/mathshepherdcompleter/#annotate-your-steps-with-the-math-shepherd-completer-using-the-structured-outputs-the-preferred-way","title":"Annotate your steps with the Math Shepherd Completer using the structured outputs (the preferred way)","text":"from distilabel.steps.tasks import MathShepherdCompleter\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdCompleter(\n llm=llm,\n N=3,\n use_default_structured_output=True\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"],\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. 
+\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n "},{"location":"components-gallery/tasks/mathshepherdcompleter/#annotate-your-steps-with-the-math-shepherd-completer","title":"Annotate your steps with the Math Shepherd Completer","text":"from distilabel.steps.tasks import MathShepherdCompleter\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdCompleter(\n llm=llm,\n N=3\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n \"golden_solution\": [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n \"solutions\": [\n [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\", \"The answer is: 18\"],\n ['Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking.', 'Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day.', 'Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.', 'The answer is: 18'],\n ]\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': [\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"],\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. +\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n "},{"location":"components-gallery/tasks/mathshepherdcompleter/#references","title":"References","text":" - Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations
"},{"location":"components-gallery/tasks/mathshepherdgenerator/","title":"MathShepherdGenerator","text":"Math Shepherd solution generator. This task is in charge of generating completions for a given instruction, in the format expected by the Math Shepherd Completer task. The attributes make the task flexible to be used with different types of dataset and LLMs, but we provide examples for the GSM8K and MATH datasets as presented in the original paper. Before modifying them, review the current defaults to ensure the completions are generated correctly. This task can be used to generate the golden solutions for a given problem if not provided, as well as possible solutions to be then labeled by the Math Shepherd Completer. Only one of solutions or golden_solution will be generated, depending on the value of M. "},{"location":"components-gallery/tasks/mathshepherdgenerator/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to be used in the completions. The default one has been checked and generates good completions using Llama 3.1 with 8B and 70B, but it can be modified to adapt it to the model and dataset selected. Take into account that the system prompt includes 2 variables in the Jinja2 template, {{extra_rules}} and {{few_shot}}. These variables are used to include extra rules, for example to steer the model towards a specific type of response, and few-shot examples. They can be modified to adapt the system prompt to the dataset and model used without needing to change the full system prompt. -
extra_rules: This field can be used to insert extra rules relevant to the type of dataset. For example, in the original paper they used GSM8K and MATH datasets, and this field can be used to insert the rules for the GSM8K dataset. -
few_shots: Few-shot examples to help the model generate the completions; write them in the format of the type of solutions wanted for your dataset. -
M: Number of completions to generate for each step. By default it is set to 1, which will generate the \"golden_solution\". In this case, select a stronger model, as it will be used as the source of truth during labelling. If M is set to a number greater than 1, the task will generate a list of completions to be labeled by the Math Shepherd Completer task. "},{"location":"components-gallery/tasks/mathshepherdgenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n end\n subgraph New columns\n OCOL0[golden_solution]\n OCOL1[solutions]\n OCOL2[model_name]\n end\n end\n\n subgraph MathShepherdGenerator\n StepInput[Input Columns: instruction]\n StepOutput[Output Columns: golden_solution, solutions, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/mathshepherdgenerator/#inputs","title":"Inputs","text":" - instruction (
str ): The task or instruction. "},{"location":"components-gallery/tasks/mathshepherdgenerator/#outputs","title":"Outputs","text":" -
golden_solution (str ): The step-by-step solution to the instruction. It will be generated if M is equal to 1. -
solutions (List[List[str]] ): A list of possible solutions to the instruction. It will be generated if M is greater than 1. -
model_name (str ): The name of the model used to generate the revision. "},{"location":"components-gallery/tasks/mathshepherdgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/mathshepherdgenerator/#generate-the-solution-for-a-given-instruction-prefer-a-stronger-model-here","title":"Generate the solution for a given instruction (prefer a stronger model here)","text":"from distilabel.steps.tasks import MathShepherdGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.6,\n \"max_new_tokens\": 1024,\n },\n)\ntask = MathShepherdGenerator(\n name=\"golden_solution_generator\",\n llm=llm,\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'golden_solution': '[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"]'}]]\n "},{"location":"components-gallery/tasks/mathshepherdgenerator/#generate-m-completions-for-a-given-instruction-using-structured-output-generation","title":"Generate M completions for a given instruction (using structured output generation)","text":"from distilabel.steps.tasks import MathShepherdGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 2048,\n },\n)\ntask = MathShepherdGenerator(\n name=\"solution_generator\",\n llm=llm,\n M=2,\n use_default_structured_output=True,\n)\n\ntask.load()\n\nresult = next(\n task.process(\n [\n {\n \"instruction\": \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n },\n ]\n )\n)\n# [[{'instruction': \"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\",\n# 'solutions': [[\"Step 1: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day. -\", \"Step 2: She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\\u2019s market.\", \"The answer is: 18\"], [\"Step 1: Janets ducks lay 16 eggs per day, and she uses 3 + 4 = <<3+4=7>>7 for eating and baking. +\", \"Step 2: So she sells 16 - 7 = <<16-7=9>>9 duck eggs every day. 
+\", \"Step 3: Those 9 eggs are worth 9 * $2 = $<<9*2=18>>18.\", \"The answer is: 18\"]]}]]\n "},{"location":"components-gallery/tasks/mathshepherdgenerator/#references","title":"References","text":" - Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations
"},{"location":"components-gallery/tasks/selfinstruct/","title":"SelfInstruct","text":"Generate instructions based on a given input using an LLM . SelfInstruct is a pre-defined task that, given a number of instructions, a certain criteria for query generations, an application description, and an input, generates a number of instruction related to the given input and following what is stated in the criteria for query generation and the application description. It is based in the SelfInstruct framework from the paper \"Self-Instruct: Aligning Language Models with Self-Generated Instructions\". "},{"location":"components-gallery/tasks/selfinstruct/#attributes","title":"Attributes","text":" -
num_instructions: The number of instructions to be generated. Defaults to 5. -
criteria_for_query_generation: The criteria for the query generation. Defaults to the criteria defined within the paper. -
application_description: The description of the AI application that one wants to build with these instructions. Defaults to AI assistant . "},{"location":"components-gallery/tasks/selfinstruct/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[input]\n end\n subgraph New columns\n OCOL0[instructions]\n OCOL1[model_name]\n end\n end\n\n subgraph SelfInstruct\n StepInput[Input Columns: input]\n StepOutput[Output Columns: instructions, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/selfinstruct/#inputs","title":"Inputs","text":" - input (
str ): The input to generate the instructions. It's also called seed in the paper. "},{"location":"components-gallery/tasks/selfinstruct/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/selfinstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/selfinstruct/#generate-instructions-based-on-a-given-input","title":"Generate instructions based on a given input","text":"from distilabel.steps.tasks import SelfInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\nself_instruct = SelfInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=5, # This is the default value\n)\n\nself_instruct.load()\n\nresult = next(self_instruct.process([{\"input\": \"instruction\"}]))\n# result\n# [\n# {\n# 'input': 'instruction',\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# 'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n# }\n# ]\n "},{"location":"components-gallery/tasks/textgeneration/","title":"TextGeneration","text":"Text generation with an LLM given a prompt. TextGeneration is a pre-defined task that allows passing a custom prompt using the Jinja2 syntax. By default, a instruction is expected in the inputs, but the using template and columns attributes one can define a custom prompt and columns expected from the text. This task should be good enough for tasks that don't need post-processing of the responses generated by the LLM. "},{"location":"components-gallery/tasks/textgeneration/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to use in the generation. If not provided, then it will check if the input row has a column named system_prompt and use it. If not, then no system prompt will be used. Defaults to None . -
template: The template to use for the generation. It must follow the Jinja2 template syntax. If not provided, it will assume the text passed is an instruction and construct the appropriate template. -
columns: A string with the column, or a list with columns expected in the template. Take a look at the examples for more information. Defaults to instruction . -
use_system_prompt: DEPRECATED. To be removed in 1.5.0. Whether to use the system prompt in the generation. Defaults to True , which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored. "},{"location":"components-gallery/tasks/textgeneration/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[generation]\n OCOL1[model_name]\n end\n end\n\n subgraph TextGeneration\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: generation, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/textgeneration/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): By default will be set to instruction . The columns can point both to a str or a List[str] to be used in the template. "},{"location":"components-gallery/tasks/textgeneration/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/textgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textgeneration/#generate-text-from-an-instruction","title":"Generate text from an instruction","text":"from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n )\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [{\"instruction\": \"your instruction\"}]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'your instruction',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'generation',\n# }\n# ]\n "},{"location":"components-gallery/tasks/textgeneration/#use-a-custom-template-to-generate-text","title":"Use a custom template to generate text","text":"from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Document:\n{{ document }}\n\nQuestion: {{ question }}\n\nPlease provide a clear and concise answer to the question based on the information in the document and your general knowledge:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n template=CUSTOM_TEMPLATE,\n columns=[\"document\", \"question\"],\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [\n {\n \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n# 'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n# }\n# ]\n "},{"location":"components-gallery/tasks/textgeneration/#few-shot-learning-with-different-system-prompts","title":"Few shot learning with different system prompts","text":"from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n{% for example in examples %}\nExample {{ loop.index }}:\nInstruction: {{ example }}\n\n{% endfor %}\nNow, generate a new instruction in a similar style:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n template=CUSTOM_TEMPLATE,\n columns=\"examples\",\n)\n\ntext_gen.load()\n\nresult = next(\n text_gen.process(\n [\n {\n \"examples\": [\"This is an example\", \"Another relevant example\"],\n \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'examples': ['This is an example', 'Another relevant example'],\n# 'system_prompt': 'You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n# 'generation': 'Disable the firewall on the router',\n# }\n# ]\n "},{"location":"components-gallery/tasks/textgeneration/#references","title":"References","text":" - Jinja2 Template Designer Documentation
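One more minimal sketch, showing the per-row system_prompt behaviour described in the attributes above (when no system_prompt is set on the task, a system_prompt column in the input row is picked up automatically); the model ID and prompts are illustrative: from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\ntext_gen = TextGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n)\n\ntext_gen.load()\n\n# No system_prompt was set on the task, so the column in the row is used instead.\nresult = next(\n text_gen.process(\n [\n {\n \"instruction\": \"Summarise the benefits of unit testing in one sentence.\",\n \"system_prompt\": \"You are a concise technical writer.\",\n }\n ]\n )\n)\n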
"},{"location":"components-gallery/tasks/textgenerationwithimage/","title":"TextGenerationWithImage","text":"Text generation with images with an LLM given a prompt. TextGenerationWithImage is a pre-defined task that allows passing a custom prompt using the Jinja2 syntax. By default, a instruction is expected in the inputs, but the using template and columns attributes one can define a custom prompt and columns expected from the text. Additionally, an image column is expected containing one of the url, base64 encoded image or PIL image. This task inherits from TextGeneration , so all the functionality available in that task related to the prompt will be available here too. "},{"location":"components-gallery/tasks/textgenerationwithimage/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to use in the generation. If not provided, then no system prompt will be used. Defaults to None . -
template: The template to use for the generation. It must follow the Jinja2 template syntax. If not provided, it will assume the text passed is an instruction and construct the appropriate template. -
columns: A string with the column, or a list with columns expected in the template. Take a look at the examples for more information. Defaults to instruction . -
image_type: The type of the image provided; this will be used to preprocess the image if necessary. Must be one of \"url\", \"base64\" or \"PIL\". "},{"location":"components-gallery/tasks/textgenerationwithimage/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[dynamic]\n end\n subgraph New columns\n OCOL0[generation]\n OCOL1[model_name]\n end\n end\n\n subgraph TextGenerationWithImage\n StepInput[Input Columns: dynamic]\n StepOutput[Output Columns: generation, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/textgenerationwithimage/#inputs","title":"Inputs","text":" - dynamic (determined by
columns attribute): By default will be set to instruction . The columns can point both to a str or a list[str] to be used in the template. "},{"location":"components-gallery/tasks/textgenerationwithimage/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/textgenerationwithimage/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textgenerationwithimage/#answer-questions-from-an-image","title":"Answer questions from an image","text":"from distilabel.steps.tasks import TextGenerationWithImage\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nvision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"url\"\n)\n\nvision.load()\n\nresult = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\"\n }\n ]\n )\n)\n# result\n# [\n# {\n# \"instruction\": \"What\u2019s in this image?\",\n# \"image\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n# \"generation\": \"Based on the visual cues in the image...\",\n# \"model_name\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\"\n# ... # distilabel_metadata would be here\n# }\n# ]\n# result[0][\"generation\"]\n# \"Based on the visual cues in the image, here are some possible story points:\n\n* The image features a wooden boardwalk leading through a lush grass field, possibly in a park or nature reserve.\n\nAnalysis and Ideas:\n* The abundance of green grass and trees suggests a healthy ecosystem or habitat.\n* The presence of wildlife, such as birds or deer, is possible based on the surroundings.\n* A footbridge or a pathway might be a common feature in this area, providing access to nearby attractions or points of interest.\n\nAdditional Questions to Ask:\n* Why is a footbridge present in this area?\n* What kind of wildlife inhabits this region\"\n "},{"location":"components-gallery/tasks/textgenerationwithimage/#answer-questions-from-an-image-stored-as-base64","title":"Answer questions from an image stored as base64","text":"# For this example we will assume that we have the string representation of the image\n# stored, but will just take the image and transform it to base64 to ilustrate the example.\nimport requests\nimport base64\n\nimage_url =\"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\nimg = requests.get(image_url).content\nbase64_image = base64.b64encode(img).decode(\"utf-8\")\n\nfrom distilabel.steps.tasks import TextGenerationWithImage\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nvision = TextGenerationWithImage(\n name=\"vision_gen\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n ),\n image_type=\"base64\"\n)\n\nvision.load()\n\nresult = next(\n vision.process(\n [\n {\n \"instruction\": \"What\u2019s in this image?\",\n \"image\": base64_image\n }\n ]\n )\n)\n "},{"location":"components-gallery/tasks/textgenerationwithimage/#references","title":"References","text":""},{"location":"components-gallery/tasks/urial/","title":"URIAL","text":"Generates a response using a non-instruct fine-tuned model. 
URIAL is a pre-defined task that generates a response using a non-instruct fine-tuned model. This task is used to generate a response based on the conversation provided as input. "},{"location":"components-gallery/tasks/urial/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[conversation]\n end\n subgraph New columns\n OCOL0[generation]\n OCOL1[model_name]\n end\n end\n\n subgraph URIAL\n StepInput[Input Columns: instruction, conversation]\n StepOutput[Output Columns: generation, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/urial/#inputs","title":"Inputs","text":" -
instruction (str , optional): The instruction to generate a response from. -
conversation (List[Dict[str, str]] , optional): The conversation to generate a response from (the last message must be from the user). "},{"location":"components-gallery/tasks/urial/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/urial/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/urial/#generate-text-from-an-instruction","title":"Generate text from an instruction","text":"from distilabel.models import vLLM\nfrom distilabel.steps.tasks import URIAL\n\nstep = URIAL(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3.1-8B\",\n generation_kwargs={\"temperature\": 0.7},\n ),\n)\n\nstep.load()\n\nresults = next(\n step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n)\n# [\n# {\n# 'instruction': \"What's the most most common type of cloud?\",\n# 'generation': 'Clouds are classified into three main types, high, middle, and low. The most common type of cloud is the middle cloud.',\n# 'distilabel_metadata': {...},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n# }\n# ]\n "},{"location":"components-gallery/tasks/urial/#references","title":"References","text":" - The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning
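A complementary minimal sketch using the conversation input described above instead of a plain instruction (the model and generation settings are illustrative); note that the last message must come from the user: from distilabel.models import vLLM\nfrom distilabel.steps.tasks import URIAL\n\nstep = URIAL(\n llm=vLLM(\n model=\"meta-llama/Meta-Llama-3.1-8B\",\n generation_kwargs={\"temperature\": 0.7},\n ),\n)\n\nstep.load()\n\nresults = next(\n step.process(\n inputs=[\n {\n \"conversation\": [\n {\"role\": \"user\", \"content\": \"What's the most common type of cloud?\"},\n {\"role\": \"assistant\", \"content\": \"Cumulus clouds are among the most common.\"},\n {\"role\": \"user\", \"content\": \"And at high altitudes?\"},\n ]\n }\n ]\n )\n)\n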
"},{"location":"components-gallery/tasks/magpiegenerator/","title":"MagpieGenerator","text":"Generator task the generates instructions or conversations using Magpie. Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'. "},{"location":"components-gallery/tasks/magpiegenerator/#attributes","title":"Attributes","text":" -
n_turns: the number of turns that the generated conversation will have. Defaults to 1 . -
end_with_user: whether the conversation should end with a user message. Defaults to False . -
include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False . -
only_instruction: whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . -
system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None . -
num_rows: the number of rows to be generated. "},{"location":"components-gallery/tasks/magpiegenerator/#runtime-parameters","title":"Runtime Parameters","text":" -
n_turns: the number of turns that the generated conversation will have. Defaults to 1 . -
end_with_user: whether the conversation should end with a user message. Defaults to False . -
include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False . -
only_instruction: whether to generate only the instruction. If this argument is True , then n_turns will be ignored. Defaults to False . -
system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. -
num_rows: the number of rows to be generated. "},{"location":"components-gallery/tasks/magpiegenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[conversation]\n OCOL1[instruction]\n OCOL2[response]\n OCOL3[system_prompt_key]\n OCOL4[model_name]\n end\n end\n\n subgraph MagpieGenerator\n StepOutput[Output Columns: conversation, instruction, response, system_prompt_key, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n "},{"location":"components-gallery/tasks/magpiegenerator/#outputs","title":"Outputs","text":" -
conversation (ChatType ): the generated conversation which is a list of chat items with a role and a message. -
instruction (str ): the generated instructions if only_instruction=True . -
response (str ): the generated response if n_turns==1 . -
system_prompt_key (str , optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary. -
model_name (str ): The model name used to generate the conversation or instruction . "},{"location":"components-gallery/tasks/magpiegenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/magpiegenerator/#generating-instructions-with-llama-3-8b-instruct-and-transformersllm","title":"Generating instructions with Llama 3 8B Instruct and TransformersLLM","text":"from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 256,\n },\n device=\"mps\",\n ),\n only_instruction=True,\n num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n# [\n# {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n# {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n# ],\n# True\n# )\n "},{"location":"components-gallery/tasks/magpiegenerator/#generating-a-conversation-with-llama-3-8b-instruct-and-transformersllm","title":"Generating a conversation with Llama 3 8B Instruct and TransformersLLM","text":"from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n llm=TransformersLLM(\n model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 1.0,\n \"max_new_tokens\": 64,\n },\n device=\"mps\",\n ),\n n_turns=3,\n num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n# [\n# {\n# 'conversation': [\n# {\n# 'role': 'system',\n# 'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n# },\n# {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n# {\n# 'role': 'assistant',\n# 'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n# let's break down the basics. First, we need to identify your goals and target audience. What do\"\n# },\n# {\n# 'role': 'user',\n# 'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n# expenses you might encounter: 1. Time: As the business owner, you'll likely spend time creating\"\n# },\n# {\n# 'role': 'assistant',\n# 'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n# agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n# }\n# ]\n# },\n# {\n# 'conversation': [\n# {\n# 'role': 'system',\n# 'content': 'You are a helpful Al assistant. 
The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n# },\n# {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n# {\n# 'role': 'assistant',\n# 'content': 'Exciting times ahead! When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n# **Purpose**: How will you use your laptop or computer? For work, gaming, video editing,'\n# },\n# {\n# 'role': 'user',\n# 'content': 'Let me stop you there. Let's explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? For example, if I're primarily using my\n# laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n# },\n# {\n# 'role': 'assistant',\n# 'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n# option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n# }\n# ]\n# }\n# ],\n# True\n# )\n "},{"location":"components-gallery/tasks/magpiegenerator/#generating-with-system-prompts-with-probabilities","title":"Generating with system prompts with probabilities","text":"from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\nmagpie = MagpieGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 0.8,\n \"max_new_tokens\": 256,\n },\n ),\n n_turns=2,\n system_prompt={\n \"math\": (\"You're an expert AI assistant.\", 0.8),\n \"writing\": (\"You're an expert writing assistant.\", 0.2),\n },\n)\n\nmagpie.load()\n\nresult = next(magpie.process())\n "},{"location":"components-gallery/tasks/magpiegenerator/#references","title":"References","text":" - Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
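A final minimal sketch combining the end_with_user and include_system_prompt attributes described above (all values are illustrative): from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n magpie_pre_query_template=\"llama3\",\n generation_kwargs={\n \"temperature\": 0.8,\n \"max_new_tokens\": 256,\n },\n ),\n n_turns=2,\n end_with_user=True, # the generated conversation will end with a user message\n include_system_prompt=True, # keep the system prompt in the generated conversation\n num_rows=3,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n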
"},{"location":"components-gallery/tasks/chatgeneration/","title":"ChatGeneration","text":"Generates text based on a conversation. ChatGeneration is a pre-defined task that defines the messages as the input and generation as the output. This task is used to generate text based on a conversation. The model_name is also returned as part of the output in order to enhance it. "},{"location":"components-gallery/tasks/chatgeneration/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[messages]\n end\n subgraph New columns\n OCOL0[generation]\n OCOL1[model_name]\n end\n end\n\n subgraph ChatGeneration\n StepInput[Input Columns: messages]\n StepOutput[Output Columns: generation, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/chatgeneration/#inputs","title":"Inputs","text":" - messages (
List[Dict[Literal[\"role\", \"content\"], str]] ): The messages to generate the follow up completion from. "},{"location":"components-gallery/tasks/chatgeneration/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/chatgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/chatgeneration/#generate-text-from-a-conversation-in-openai-chat-format","title":"Generate text from a conversation in OpenAI chat format","text":"from distilabel.steps.tasks import ChatGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nchat = ChatGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nchat.load()\n\nresult = next(\n chat.process(\n [\n {\n \"messages\": [\n {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n ]\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# 'generation': '4',\n# }\n# ]\n "},{"location":"components-gallery/tasks/argillalabeller/","title":"ArgillaLabeller","text":"Annotate Argilla records based on input fields, example records and question settings. This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM. It uses a system prompt that guides the LLM to understand the input fields, the question type, and the question settings. The task then formats the input data and generates a response based on the question. The response is validated against the question's value model, and the final suggestion is prepared for annotation. "},{"location":"components-gallery/tasks/argillalabeller/#attributes","title":"Attributes","text":" - _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/argillalabeller/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[record]\n ICOL1[fields]\n ICOL2[question]\n ICOL3[example_records]\n ICOL4[guidelines]\n end\n subgraph New columns\n OCOL0[suggestion]\n end\n end\n\n subgraph ArgillaLabeller\n StepInput[Input Columns: record, fields, question, example_records, guidelines]\n StepOutput[Output Columns: suggestion]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n ICOL4 --> StepInput\n StepOutput --> OCOL0\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/argillalabeller/#inputs","title":"Inputs","text":" -
record (argilla.Record ): The record to be annotated. -
fields (Optional[List[Dict[str, Any]]] ): The list of field settings for the input fields. -
question (Optional[Dict[str, Any]] ): The question settings for the question to be answered. -
example_records (Optional[List[Dict[str, Any]]] ): The few-shot example records with responses to be used to answer the question. -
guidelines (Optional[str] ): The guidelines for the annotation task. "},{"location":"components-gallery/tasks/argillalabeller/#outputs","title":"Outputs","text":" - suggestion (
Dict[str, Any] ): The final suggestion for annotation. "},{"location":"components-gallery/tasks/argillalabeller/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/argillalabeller/#annotate-a-record-with-the-same-dataset-and-question","title":"Annotate a record with the same dataset and question","text":"import argilla as rg\nfrom argilla import Suggestion\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\npending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\ncompleted_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\npending_records = list(\n dataset.records(\n query=rg.Query(filter=pending_records_filter),\n limit=5,\n )\n)\nexample_records = list(\n dataset.records(\n query=rg.Query(filter=completed_records_filter),\n limit=5,\n )\n)\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n fields=[field],\n question=question,\n example_records=example_records,\n guidelines=dataset.guidelines\n)\nlabeller.load()\n\n# Process the pending records\nresult = next(\n labeller.process(\n [\n {\n \"record\": record\n } for record in pending_records\n ]\n )\n)\n\n# Add the suggestions to the records\nfor record, suggestion in zip(pending_records, result):\n record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated records\ndataset.records.log(pending_records)\n "},{"location":"components-gallery/tasks/argillalabeller/#annotate-a-record-with-alternating-datasets-and-questions","title":"Annotate a record with alternating datasets and questions","text":"import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\nquestion2 = dataset.settings.questions[\"label2\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\nlabeller.load()\n\n# Process the record\nrecord = next(dataset.records())\nresult = next(\n labeller.process(\n [\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question,\n },\n {\n \"record\": record,\n \"fields\": [field],\n \"question\": question2,\n }\n ]\n )\n)\n\n# Add the suggestions to the record\nfor suggestion in result:\n record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated record\ndataset.records.log([record])\n "},{"location":"components-gallery/tasks/argillalabeller/#overwrite-default-prompts-and-instructions","title":"Overwrite default prompts and instructions","text":"import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Overwrite default prompts and instructions\nlabeller = ArgillaLabeller(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n question_to_label_instruction={\n 
\"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n \"multi_label_selection\": \"Select none, one or multiple labels from the list of provided labels.\",\n \"text\": \"Provide a text response to the question.\",\n \"rating\": \"Provide a rating for the question.\",\n },\n)\nlabeller.load()\n "},{"location":"components-gallery/tasks/argillalabeller/#references","title":"References","text":" - Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets
"},{"location":"components-gallery/tasks/textclassification/","title":"TextClassification","text":"Classifies text into one or more categories or labels. This task can be used for text classification problems, where the goal is to assign one or multiple labels to a given text. It uses structured generation as per the reference paper by default, it can help to generate more concise labels. See section 4.1 in the reference. "},{"location":"components-gallery/tasks/textclassification/#attributes","title":"Attributes","text":" -
system_prompt: The system prompt to use in the generation. Contains a default message to make the model behave like a classifier specialist. -
n: Number of labels to generate. If only 1 is required, it corresponds to a label classification problem; if >1, the task will return the \"n\" labels most representative of the text. Defaults to 1. -
context: Context to use when generating the labels. By default contains a generic message, but can be used to customize the context for the task. -
examples: List of examples to help the model understand the task, few shots. -
available_labels: List of available labels to choose from when classifying the text, or a dictionary with the labels and their descriptions. -
default_label: Default label to use when the text is ambiguous or lacks sufficient information for classification. Can be a list in case of multiple labels (n>1). "},{"location":"components-gallery/tasks/textclassification/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[text]\n end\n subgraph New columns\n OCOL0[labels]\n OCOL1[model_name]\n end\n end\n\n subgraph TextClassification\n StepInput[Input Columns: text]\n StepOutput[Output Columns: labels, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/textclassification/#inputs","title":"Inputs","text":" - text (
str ): The reference text we want to obtain labels for. "},{"location":"components-gallery/tasks/textclassification/#outputs","title":"Outputs","text":" -
labels (Union[str, List[str]] ): The label or list of labels for the text. -
model_name (str ): The name of the model used to generate the label/s. "},{"location":"components-gallery/tasks/textclassification/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textclassification/#assigning-a-sentiment-to-a-text","title":"Assigning a sentiment to a text","text":"from distilabel.steps.tasks import TextClassification\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\ntext_classification = TextClassification(\n llm=llm,\n context=\"You are an AI system specialized in assigning sentiment to movies.\",\n available_labels=[\"positive\", \"negative\"],\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n )\n)\n# result\n# [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n# 'labels': 'positive',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"positive\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"positive\", # The text shows positive sentiment\\n \"negative\", # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/textclassification/#assigning-predefined-labels-with-specified-descriptions","title":"Assigning predefined labels with specified descriptions","text":"from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n llm=llm,\n n=1,\n context=\"Determine the intent of the text.\",\n available_labels={\n \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n \"inquiry\": \"A question or request for information about a product, service, or situation. It's a neutral or curious expression seeking clarification or details.\",\n \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. 
It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n },\n query_title=\"Customer Query\",\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"Can you tell me more about your return policy?\"}]\n )\n)\n# result\n# [{'text': 'Can you tell me more about your return policy?',\n# 'labels': 'inquiry',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": \"inquiry\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n \"complaint\", # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n \"inquiry\", # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n \"feedback\", # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n \"praise\", # A statement expressing admiration, approval, or appreciation for a product, service, or experience. 
It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/textclassification/#free-multi-label-classification-without-predefined-labels","title":"Free multi label classification without predefined labels","text":"from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n llm=llm,\n n=3,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following type of persona.\"\n ),\n query_title=\"Example of Persona\",\n)\n\ntext_classification.load()\n\nresult = next(\n text_classification.process(\n [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\"}]\n )\n)\n# result\n# [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n# 'labels': ['Historical Researcher',\n# 'Cultural Specialist',\n# 'Ethnic Studies Expert'],\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n# 'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n# {'role': 'user',\n# 'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/textclassification/#references","title":"References","text":" - Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models
"},{"location":"components-gallery/tasks/evolinstruct/","title":"EvolInstruct","text":"Evolve instructions using an LLM . WizardLM: Empowering Large Language Models to Follow Complex Instructions "},{"location":"components-gallery/tasks/evolinstruct/#attributes","title":"Attributes","text":" -
num_evolutions: The number of evolutions to be performed. -
store_evolutions: Whether to store all the evolutions or just the last one. Defaults to False . -
generate_answers: Whether to generate answers for the evolved instructions. Defaults to False . -
include_original_instruction: Whether to include the original instruction in the evolved_instructions output column. Defaults to False . -
mutation_templates: The mutation templates to be used for evolving the instructions. Defaults to the ones provided in the utils.py file. -
seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . "},{"location":"components-gallery/tasks/evolinstruct/#runtime-parameters","title":"Runtime Parameters","text":" - seed: The seed to be set for
numpy in order to randomly pick a mutation method. "},{"location":"components-gallery/tasks/evolinstruct/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n end\n subgraph New columns\n OCOL0[evolved_instruction]\n OCOL1[evolved_instructions]\n OCOL2[model_name]\n OCOL3[answer]\n OCOL4[answers]\n end\n end\n\n subgraph EvolInstruct\n StepInput[Input Columns: instruction]\n StepOutput[Output Columns: evolved_instruction, evolved_instructions, model_name, answer, answers]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepOutput --> OCOL4\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/evolinstruct/#inputs","title":"Inputs","text":" - instruction (
str ): The instruction to evolve. "},{"location":"components-gallery/tasks/evolinstruct/#outputs","title":"Outputs","text":" -
evolved_instruction (str ): The evolved instruction if store_evolutions=False . -
evolved_instructions (List[str] ): The evolved instructions if store_evolutions=True . -
model_name (str ): The name of the LLM used to evolve the instructions. -
answer (str ): The answer to the evolved instruction if generate_answers=True and store_evolutions=False . -
answers (List[str] ): The answers to the evolved instructions if generate_answers=True and store_evolutions=True . "},{"location":"components-gallery/tasks/evolinstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolinstruct/#evolve-an-instruction-using-an-llm","title":"Evolve an instruction using an LLM","text":"from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n "},{"location":"components-gallery/tasks/evolinstruct/#keep-the-iterations-of-the-evolutions","title":"Keep the iterations of the evolutions","text":"from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n store_evolutions=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'evolved_instructions': ['initial evolution', 'final evolution'],\n# 'model_name': 'model_name'\n# }\n# ]\n "},{"location":"components-gallery/tasks/evolinstruct/#generate-answers-for-the-instructions-in-a-single-step","title":"Generate answers for the instructions in a single step","text":"from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n generate_answers=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'evolved_instruction': 'evolved instruction',\n# 'answer': 'answer to the instruction',\n# 'model_name': 'model_name'\n# }\n# ]\n "},{"location":"components-gallery/tasks/evolinstruct/#references","title":"References","text":""},{"location":"components-gallery/tasks/evolcomplexity/","title":"EvolComplexity","text":"Evolve instructions to make them more complex using an LLM . EvolComplexity is a task that evolves instructions to make them more complex, and it is based in the EvolInstruct task, using slight different prompts, but the exact same evolutionary approach. "},{"location":"components-gallery/tasks/evolcomplexity/#attributes","title":"Attributes","text":" -
num_instructions: The number of instructions to be generated. -
generate_answers: Whether to generate answers for the instructions or not. Defaults to False . -
mutation_templates: The mutation templates to be used for the generation of the instructions. -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . -
seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . "},{"location":"components-gallery/tasks/evolcomplexity/#runtime-parameters","title":"Runtime Parameters","text":" -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. -
seed: The seed to be set for numpy in order to randomly pick a mutation method. "},{"location":"components-gallery/tasks/evolcomplexity/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n end\n subgraph New columns\n OCOL0[evolved_instruction]\n OCOL1[answer]\n OCOL2[model_name]\n end\n end\n\n subgraph EvolComplexity\n StepInput[Input Columns: instruction]\n StepOutput[Output Columns: evolved_instruction, answer, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/evolcomplexity/#inputs","title":"Inputs","text":" - instruction (
str ): The instruction to evolve. "},{"location":"components-gallery/tasks/evolcomplexity/#outputs","title":"Outputs","text":" -
evolved_instruction (str ): The evolved instruction. -
answer (str , optional): The answer to the instruction if generate_answers=True . -
model_name (str ): The name of the LLM used to evolve the instructions. "},{"location":"components-gallery/tasks/evolcomplexity/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolcomplexity/#evolve-an-instruction-using-an-llm","title":"Evolve an instruction using an LLM","text":"from distilabel.steps.tasks import EvolComplexity\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity = EvolComplexity(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_complexity.load()\n\nresult = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n "},{"location":"components-gallery/tasks/evolcomplexity/#references","title":"References","text":""},{"location":"components-gallery/tasks/evolquality/","title":"EvolQuality","text":"Evolve the quality of the responses using an LLM . EvolQuality task is used to evolve the quality of the responses given a prompt, by generating a new response with a language model. This step implements the evolution quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. "},{"location":"components-gallery/tasks/evolquality/#attributes","title":"Attributes","text":" -
num_evolutions: The number of evolutions to be performed on the responses. -
store_evolutions: Whether to store all the evolved responses or just the last one. Defaults to False . -
include_original_response: Whether to include the original response within the evolved responses. Defaults to False . -
mutation_templates: The mutation templates to be used to evolve the responses. -
seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . "},{"location":"components-gallery/tasks/evolquality/#runtime-parameters","title":"Runtime Parameters","text":" - seed: The seed to be set for
numpy in order to randomly pick a mutation method. "},{"location":"components-gallery/tasks/evolquality/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[response]\n end\n subgraph New columns\n OCOL0[evolved_response]\n OCOL1[evolved_responses]\n OCOL2[model_name]\n end\n end\n\n subgraph EvolQuality\n StepInput[Input Columns: instruction, response]\n StepOutput[Output Columns: evolved_response, evolved_responses, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/evolquality/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/evolquality/#outputs","title":"Outputs","text":" -
evolved_response (str ): The evolved response if store_evolutions=False . -
evolved_responses (List[str] ): The evolved responses if store_evolutions=True . -
model_name (str ): The name of the LLM used to evolve the responses. "},{"location":"components-gallery/tasks/evolquality/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolquality/#evolve-the-quality-of-the-responses-given-a-prompt","title":"Evolve the quality of the responses given a prompt","text":"from distilabel.steps.tasks import EvolQuality\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_quality = EvolQuality(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_evolutions=2,\n)\n\nevol_quality.load()\n\nresult = next(\n evol_quality.process(\n [\n {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'common instruction',\n# 'response': 'a response',\n# 'evolved_response': 'evolved response',\n# 'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n# }\n# ]\n "},{"location":"components-gallery/tasks/evolquality/#references","title":"References","text":" - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/evolinstructgenerator/","title":"EvolInstructGenerator","text":"Generate evolved instructions using an LLM . WizardLM: Empowering Large Language Models to Follow Complex Instructions "},{"location":"components-gallery/tasks/evolinstructgenerator/#attributes","title":"Attributes","text":" -
num_instructions: The number of instructions to be generated. -
generate_answers: Whether to generate answers for the instructions or not. Defaults to False . -
mutation_templates: The mutation templates to be used for the generation of the instructions. -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . -
seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . "},{"location":"components-gallery/tasks/evolinstructgenerator/#runtime-parameters","title":"Runtime Parameters","text":" -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. -
seed: The seed to be set for numpy in order to randomly pick a mutation method. "},{"location":"components-gallery/tasks/evolinstructgenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[instruction]\n OCOL1[answer]\n OCOL2[instructions]\n OCOL3[model_name]\n end\n end\n\n subgraph EvolInstructGenerator\n StepOutput[Output Columns: instruction, answer, instructions, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n "},{"location":"components-gallery/tasks/evolinstructgenerator/#outputs","title":"Outputs","text":" -
instruction (str ): The generated instruction if generate_answers=False . -
answer (str ): The generated answer if generate_answers=True . -
instructions (List[str] ): The generated instructions if generate_answers=True . -
model_name (str ): The name of the LLM used to generate and evolve the instructions. "},{"location":"components-gallery/tasks/evolinstructgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolinstructgenerator/#generate-evolved-instructions-without-initial-instructions","title":"Generate evolved instructions without initial instructions","text":"from distilabel.steps.tasks import EvolInstructGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct_generator = EvolInstructGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=2,\n)\n\nevol_instruct_generator.load()\n\nresult = next(evol_instruct_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n "},{"location":"components-gallery/tasks/evolinstructgenerator/#references","title":"References","text":""},{"location":"components-gallery/tasks/evolcomplexitygenerator/","title":"EvolComplexityGenerator","text":"Generate evolved instructions with increased complexity using an LLM . EvolComplexityGenerator is a generation task that evolves instructions to make them more complex. It is based on the EvolInstruct task, using slightly different prompts but the exact same evolutionary approach. "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#attributes","title":"Attributes","text":" -
num_instructions: The number of instructions to be generated. -
generate_answers: Whether to generate answers for the instructions or not. Defaults to False . -
mutation_templates: The mutation templates to be used for the generation of the instructions. -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512 . -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024 . -
seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42 . "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#runtime-parameters","title":"Runtime Parameters","text":" -
min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. -
max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. -
seed: The seed to be set for numpy in order to randomly pick a mutation method. "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[instruction]\n OCOL1[answer]\n OCOL2[model_name]\n end\n end\n\n subgraph EvolComplexityGenerator\n StepOutput[Output Columns: instruction, answer, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#outputs","title":"Outputs","text":" -
instruction (str ): The evolved instruction. -
answer (str , optional): The answer to the instruction if generate_answers=True . -
model_name (str ): The name of the LLM used to evolve the instructions. "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolcomplexitygenerator/#generate-evolved-instructions-without-initial-instructions","title":"Generate evolved instructions without initial instructions","text":"from distilabel.steps.tasks import EvolComplexityGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity_generator = EvolComplexityGenerator(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n num_instructions=2,\n)\n\nevol_complexity_generator.load()\n\nresult = next(evol_complexity_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n "},{"location":"components-gallery/tasks/evolcomplexitygenerator/#references","title":"References","text":""},{"location":"components-gallery/tasks/instructionbacktranslation/","title":"InstructionBacktranslation","text":"Self-Alignment with Instruction Backtranslation. "},{"location":"components-gallery/tasks/instructionbacktranslation/#attributes","title":"Attributes","text":" - _template: the Jinja2 template to use for the Instruction Backtranslation task.
"},{"location":"components-gallery/tasks/instructionbacktranslation/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[generation]\n end\n subgraph New columns\n OCOL0[score]\n OCOL1[reason]\n OCOL2[model_name]\n end\n end\n\n subgraph InstructionBacktranslation\n StepInput[Input Columns: instruction, generation]\n StepOutput[Output Columns: score, reason, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/instructionbacktranslation/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/instructionbacktranslation/#outputs","title":"Outputs","text":" -
score (str ): The score for the generation based on the given instruction. -
reason (str ): The reason for the provided score. -
model_name (str ): The model name used to score the generation. "},{"location":"components-gallery/tasks/instructionbacktranslation/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/instructionbacktranslation/#generate-a-score-and-reason-for-a-given-instruction-and-generation","title":"Generate a score and reason for a given instruction and generation","text":"from distilabel.steps.tasks import InstructionBacktranslation\n\ninstruction_backtranslation = InstructionBacktranslation(\n name=\"instruction_backtranslation\",\n llm=llm,\n input_batch_size=10,\n output_mappings={\"model_name\": \"scoring_model\"},\n )\ninstruction_backtranslation.load()\n\nresult = next(\n instruction_backtranslation.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generation\": \"4\",\n }\n ]\n )\n)\n# result\n# [\n# {\n# \"instruction\": \"How much is 2+2?\",\n# \"generation\": \"4\",\n# \"score\": 3,\n# \"reason\": \"Reason for the generation.\",\n# \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n# }\n# ]\n "},{"location":"components-gallery/tasks/instructionbacktranslation/#references","title":"References","text":" - Self-Alignment with Instruction Backtranslation
"},{"location":"components-gallery/tasks/prometheuseval/","title":"PrometheusEval","text":"Critique and rank the quality of generations from an LLM using Prometheus 2.0. PrometheusEval is a task created for Prometheus 2.0, covering both the absolute and relative evaluations. The absolute evaluation i.e. mode=\"absolute\" is used to evaluate a single generation from an LLM for a given instruction. The relative evaluation i.e. mode=\"relative\" is used to evaluate two generations from an LLM for a given instruction. Both evaluations provide the possibility of using a reference answer to compare with or withoug the reference attribute, and both are based on a score rubric that critiques the generation/s based on the following default aspects: helpfulness , harmlessness , honesty , factual-validity , and reasoning , that can be overridden via rubrics , and the selected rubric is set via the attribute rubric . "},{"location":"components-gallery/tasks/prometheuseval/#note","title":"Note","text":"The PrometheusEval task is better suited and intended to be used with any of the Prometheus 2.0 models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0, and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting and quality is not guaranteed if using another model, even though some other models may be able to correctly follow the formatting and generate insightful critiques too. "},{"location":"components-gallery/tasks/prometheuseval/#attributes","title":"Attributes","text":" -
mode: the evaluation mode to use, either absolute or relative . It defines whether the task will evaluate one or two generations. -
rubric: the score rubric to use within the prompt to run the critique based on different aspects. Can be any existing key in the rubrics attribute, which by default means that it can be: helpfulness , harmlessness , honesty , factual-validity , or reasoning . Those will only work if using the default rubrics , otherwise, the provided rubrics should be used. -
rubrics: a dictionary containing the different rubrics to use for the critique, where the keys are the rubric names and the values are the rubric descriptions. The default rubrics are the following: helpfulness , harmlessness , honesty , factual-validity , and reasoning . -
reference: a boolean flag to indicate whether a reference answer / completion will be provided, so that the model critique is based on the comparison with it. It implies that the column reference needs to be provided within the input data in addition to the rest of the inputs. -
_template: a Jinja2 template used to format the input for the LLM. "},{"location":"components-gallery/tasks/prometheuseval/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[generation]\n ICOL2[generations]\n ICOL3[reference]\n end\n subgraph New columns\n OCOL0[feedback]\n OCOL1[result]\n OCOL2[model_name]\n end\n end\n\n subgraph PrometheusEval\n StepInput[Input Columns: instruction, generation, generations, reference]\n StepOutput[Output Columns: feedback, result, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/prometheuseval/#inputs","title":"Inputs","text":" -
instruction (str ): The instruction to use as reference. -
generation (str , optional): The generated text from the given instruction . This column is required if mode=absolute . -
generations (List[str] , optional): The generated texts from the given instruction . It should contain 2 generations only. This column is required if mode=relative . -
reference (str , optional): The reference / golden answer for the instruction , to be used by the LLM for comparison against. "},{"location":"components-gallery/tasks/prometheuseval/#outputs","title":"Outputs","text":" -
feedback (str ): The feedback explaining the result below, as critiqued by the LLM using the pre-defined score rubric, compared against reference if provided. -
result (Union[int, Literal[\"A\", \"B\"]] ): If mode=absolute , then the result contains the score for the generation on a Likert scale from 1-5, otherwise, if mode=relative , then the result contains either \"A\" or \"B\", the \"winning\" one being the generation at index 0 of generations if result='A' or at index 1 if result='B' . -
model_name (str ): The model name used to generate the feedback and result . "},{"location":"components-gallery/tasks/prometheuseval/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/prometheuseval/#critique-and-evaluate-llm-generation-quality-using-prometheus-2_0","title":"Critique and evaluate LLM generation quality using Prometheus 2_0","text":"from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"factual-validity\"\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generation\": \"something done\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n "},{"location":"components-gallery/tasks/prometheuseval/#critique-for-relative-evaluation","title":"Critique for relative evaluation","text":"from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"relative\",\n rubric=\"honesty\"\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generations': ['something done', 'other thing'],\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 'something done',\n# }\n# ]\n "},{"location":"components-gallery/tasks/prometheuseval/#critique-with-a-custom-rubric","title":"Critique with a custom rubric","text":"from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n rubric=\"custom\",\n rubrics={\n \"custom\": \"[A]\\nScore 1: A\\nScore 2: B\\nScore 3: C\\nScore 4: D\\nScore 5: E\"\n }\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\"instruction\": \"make something\", \"generation\": \"something done\"},\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n "},{"location":"components-gallery/tasks/prometheuseval/#critique-using-a-reference-answer","title":"Critique using a reference answer","text":"from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n llm=vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n ),\n mode=\"absolute\",\n 
rubric=\"helpfulness\",\n reference=True,\n)\n\nprometheus.load()\n\nresult = next(\n prometheus.process(\n [\n {\n \"instruction\": \"make something\",\n \"generation\": \"something done\",\n \"reference\": \"this is a reference answer\",\n },\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'make something',\n# 'generation': 'something done',\n# 'reference': 'this is a reference answer',\n# 'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n# 'feedback': 'the feedback',\n# 'result': 6,\n# }\n# ]\n "},{"location":"components-gallery/tasks/prometheuseval/#references","title":"References","text":""},{"location":"components-gallery/tasks/complexityscorer/","title":"ComplexityScorer","text":"Score instructions based on their complexity using an LLM . ComplexityScorer is a pre-defined task used to rank a list of instructions based in their complexity. It's an implementation of the complexity score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. "},{"location":"components-gallery/tasks/complexityscorer/#attributes","title":"Attributes","text":" - _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/complexityscorer/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instructions]\n end\n subgraph New columns\n OCOL0[scores]\n OCOL1[model_name]\n end\n end\n\n subgraph ComplexityScorer\n StepInput[Input Columns: instructions]\n StepOutput[Output Columns: scores, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/complexityscorer/#inputs","title":"Inputs","text":" - instructions (
List[str] ): The list of instructions to be scored. "},{"location":"components-gallery/tasks/complexityscorer/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/complexityscorer/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/complexityscorer/#evaluate-the-complexity-of-your-instructions","title":"Evaluate the complexity of your instructions","text":"from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n "},{"location":"components-gallery/tasks/complexityscorer/#generate-structured-output-with-default-schema","title":"Generate structured output with default schema","text":"from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\n \"scores\": [\\n 1, \\n 2\\n ]\\n}'}}]\n "},{"location":"components-gallery/tasks/complexityscorer/#references","title":"References","text":" - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/qualityscorer/","title":"QualityScorer","text":"Score responses based on their quality using an LLM . QualityScorer is a pre-defined task that defines the instruction as the input and score as the output. This task is used to rate the quality of instructions and responses. It's an implementation of the quality score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs are scored in terms of quality, obtaining a quality score for each instruction. "},{"location":"components-gallery/tasks/qualityscorer/#attributes","title":"Attributes","text":" - _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/qualityscorer/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[responses]\n end\n subgraph New columns\n OCOL0[scores]\n OCOL1[model_name]\n end\n end\n\n subgraph QualityScorer\n StepInput[Input Columns: instruction, responses]\n StepOutput[Output Columns: scores, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/qualityscorer/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/qualityscorer/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/qualityscorer/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/qualityscorer/#evaluate-the-quality-of-your-instructions","title":"Evaluate the quality of your instructions","text":"from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n )\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n)\n# result\n[\n {\n 'instructions': 'instruction',\n 'model_name': 'test',\n 'scores': [5, 3, 1],\n }\n]\n "},{"location":"components-gallery/tasks/qualityscorer/#generate-structured-output-with-default-schema","title":"Generate structured output with default schema","text":"from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\nscorer = QualityScorer(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n scorer.process(\n [\n {\n \"instruction\": \"instruction\",\n \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n }\n ]\n )\n)\n\n# result\n[{'instruction': 'instruction',\n'responses': ['good response', 'weird response', 'bad response'],\n'scores': [1, 2, 3],\n'distilabel_metadata': {'raw_output_quality_scorer_0': '{ \"scores\": [1, 2, 3] }'},\n'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/qualityscorer/#references","title":"References","text":" - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/clair/","title":"CLAIR","text":"Contrastive Learning from AI Revisions (CLAIR). CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting preference A preferred A\u2019 is much more contrastive and precise. "},{"location":"components-gallery/tasks/clair/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[task]\n ICOL1[student_solution]\n end\n subgraph New columns\n OCOL0[revision]\n OCOL1[rational]\n OCOL2[model_name]\n end\n end\n\n subgraph CLAIR\n StepInput[Input Columns: task, student_solution]\n StepOutput[Output Columns: revision, rational, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/clair/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/clair/#outputs","title":"Outputs","text":" -
revision (str ): The revised text. -
rational (str ): The rationale for the provided revision. -
model_name (str ): The name of the model used to generate the revision and rational. "},{"location":"components-gallery/tasks/clair/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/clair/#create-contrastive-preference-pairs","title":"Create contrastive preference pairs","text":"from distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 4096,\n },\n)\nclair_task = CLAIR(llm=llm)\n\nclair_task.load()\n\nresult = next(\n clair_task.process(\n [\n {\n \"task\": \"How many gaps are there between the earth and the moon?\",\n \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n }\n ]\n )\n)\n# result\n# [{'task': 'How many gaps are there between the earth and the moon?',\n# 'student_solution': 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n# 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. 
This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n# 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n# 'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n# {'role': 'user',\n# 'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. 
The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/clair/#references","title":"References","text":""},{"location":"components-gallery/tasks/ultrafeedback/","title":"UltraFeedback","text":"Rank generations focusing on different aspects using an LLM . UltraFeedback: Boosting Language Models with High-quality Feedback. "},{"location":"components-gallery/tasks/ultrafeedback/#attributes","title":"Attributes","text":" - aspect: The aspect to perform with the
UltraFeedback model. The available aspects are: - helpfulness : Evaluate text outputs based on helpfulness. - honesty : Evaluate text outputs based on honesty. - instruction-following : Evaluate text outputs based on given instructions. - truthfulness : Evaluate text outputs based on truthfulness. Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall assessment of the text outputs within a single prompt. The custom aspect is: - overall-rating : Evaluate text outputs based on an overall assessment. Defaults to \"overall-rating\" . "},{"location":"components-gallery/tasks/ultrafeedback/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[generations]\n end\n subgraph New columns\n OCOL0[ratings]\n OCOL1[rationales]\n OCOL2[model_name]\n end\n end\n\n subgraph UltraFeedback\n StepInput[Input Columns: instruction, generations]\n StepOutput[Output Columns: ratings, rationales, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/ultrafeedback/#inputs","title":"Inputs","text":""},{"location":"components-gallery/tasks/ultrafeedback/#outputs","title":"Outputs","text":" -
ratings (List[float] ): The ratings for each of the provided text outputs. -
rationales (List[str] ): The rationales for each of the provided text outputs. -
model_name (str ): The name of the model used to generate the ratings and rationales. "},{"location":"components-gallery/tasks/ultrafeedback/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-selected-aspect","title":"Rate generations from different LLMs based on the selected aspect","text":"from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n ),\n use_default_structured_output=False\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [\n# {\n# 'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [1, 2],\n# 'rationales': ['explanation for 4', 'explanation for and a car'],\n# 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n# }\n# ]\n "},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-honesty-using-the-default-structured-output","title":"Rate generations from different LLMs based on the honesty, using the default structured output","text":"from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n aspect=\"honesty\"\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [5, 1],\n# 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',\n# \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\n 5,\\n 1\\n] \\n\\n,\"rationales\": [\\n \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\n \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. 
The model shows no uncertainty or indication that it does not know the answer.\"\\n] }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-helpfulness-using-the-default-structured-output","title":"Rate generations from different LLMs based on the helpfulness, using the default structured output","text":"from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\"max_new_tokens\": 512},\n ),\n aspect=\"helpfulness\"\n)\n\nultrafeedback.load()\n\nresult = next(\n ultrafeedback.process(\n [\n {\n \"instruction\": \"How much is 2+2?\",\n \"generations\": [\"4\", \"and a car\"],\n }\n ]\n )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [1, 5],\n# 'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',\n# 'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n# 'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n# 'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n# 'types': [1, 3, 1],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\n \"ratings\": [\\n 1,\\n 5\\n ]\\n ,\\n \"rationales\": [\\n \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\n \"Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.\"\\n ]\\n ,\\n \"rationales_for_rating\": [\\n \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\n \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\n ]\\n ,\\n \"types\": [\\n 1, 3,\\n 1\\n ]\\n }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/ultrafeedback/#references","title":"References","text":""},{"location":"components-gallery/tasks/pairrm/","title":"PairRM","text":"Rank the candidates based on the input using the LLM model. "},{"location":"components-gallery/tasks/pairrm/#note","title":"Note","text":"This step differs to other tasks as there is a single implementation of this model currently, and we will use a specific LLM . 
"},{"location":"components-gallery/tasks/pairrm/#attributes","title":"Attributes","text":""},{"location":"components-gallery/tasks/pairrm/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[inputs]\n ICOL1[candidates]\n end\n subgraph New columns\n OCOL0[ranks]\n OCOL1[ranked_candidates]\n OCOL2[model_name]\n end\n end\n\n subgraph PairRM\n StepInput[Input Columns: inputs, candidates]\n StepOutput[Output Columns: ranks, ranked_candidates, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/pairrm/#inputs","title":"Inputs","text":" -
inputs (List[Dict[str, Any]] ): The input text or conversation to rank the candidates for. -
candidates (List[Dict[str, Any]] ): The candidates to rank. "},{"location":"components-gallery/tasks/pairrm/#outputs","title":"Outputs","text":" -
ranks (List[int] ): The ranks of the candidates based on the input. -
ranked_candidates (List[Dict[str, Any]] ): The candidates ranked based on the input. -
model_name (str ): The model name used to rank the candidate responses. Defaults to \"llm-blender/PairRM\" . "},{"location":"components-gallery/tasks/pairrm/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/pairrm/#rank-llm-candidates","title":"Rank LLM candidates","text":"from distilabel.steps.tasks import PairRM\n\n# PairRM uses the llm-blender/PairRM model internally, so no LLM needs to be passed.\npair_rm = PairRM()\n\npair_rm.load()\n\nresult = next(\n pair_rm.process(\n [\n {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n ]\n )\n)\n# result\n# [\n# {\n# 'input': 'Hello, how are you?',\n# 'candidates': ['fine', 'good', 'bad'],\n# 'ranks': [2, 1, 3],\n# 'ranked_candidates': ['good', 'fine', 'bad'],\n# 'model_name': 'llm-blender/PairRM',\n# }\n# ]\n "},{"location":"components-gallery/tasks/pairrm/#references","title":"References","text":""},{"location":"components-gallery/tasks/generatesentencepair/","title":"GenerateSentencePair","text":"Generate a positive and, optionally, a negative sentence given an anchor sentence. GenerateSentencePair is a pre-defined task that, given an anchor sentence, generates a positive sentence related to the anchor and optionally a negative sentence that is either unrelated to the anchor or similar to it. Optionally, you can give a context to guide the LLM towards more specific behavior. This task is useful for generating training datasets for embedding models. "},{"location":"components-gallery/tasks/generatesentencepair/#attributes","title":"Attributes","text":" -
triplet: a flag to indicate if the task should generate a triplet of sentences (anchor, positive, negative). Defaults to False . -
action: the action to perform to generate the positive sentence. -
context: the context to use for the generation. Can be helpful to guide the LLM towards more specific context. Not used by default. -
hard_negative: A flag to indicate if the negative should be a hard-negative or not. Hard negatives make it hard for the model to distinguish against the positive, with a higher degree of semantic similarity. "},{"location":"components-gallery/tasks/generatesentencepair/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[anchor]\n end\n subgraph New columns\n OCOL0[positive]\n OCOL1[negative]\n OCOL2[model_name]\n end\n end\n\n subgraph GenerateSentencePair\n StepInput[Input Columns: anchor]\n StepOutput[Output Columns: positive, negative, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generatesentencepair/#inputs","title":"Inputs","text":" - anchor (
str ): The anchor sentence to generate the positive and negative sentences. "},{"location":"components-gallery/tasks/generatesentencepair/#outputs","title":"Outputs","text":" -
positive (str ): The positive sentence related to the anchor . -
negative (str ): The negative sentence unrelated to the anchor if triplet=True , or more similar to the positive to make it more challenging for a model to distinguish in case hard_negative=True . -
model_name (str ): The name of the model that was used to generate the sentences. "},{"location":"components-gallery/tasks/generatesentencepair/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatesentencepair/#paraphrasing","title":"Paraphrasing","text":"from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"paraphrase\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n "},{"location":"components-gallery/tasks/generatesentencepair/#generating-semantically-similar-sentences","title":"Generating semantically similar sentences","text":"from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import GenerateSentencePair\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"semantically-similar\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n "},{"location":"components-gallery/tasks/generatesentencepair/#generating-queries","title":"Generating queries","text":"from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. 
Using Argilla, ...\"}])\n "},{"location":"components-gallery/tasks/generatesentencepair/#generating-answers","title":"Generating answers","text":"from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"answer\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n "},{"location":"components-gallery/tasks/generatesentencepair/#_1","title":")","text":"from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n triplet=True, # `False` to generate only positive\n action=\"query\",\n context=\"Argilla is an open-source data curation platform for LLMs.\",\n hard_negative=True,\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n input_batch_size=10,\n use_default_structured_output=True\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n "},{"location":"components-gallery/tasks/generateembeddings/","title":"GenerateEmbeddings","text":"Generate embeddings using the last hidden state of an LLM . Generate embeddings for a text input using the last hidden state of an LLM , as described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. "},{"location":"components-gallery/tasks/generateembeddings/#attributes","title":"Attributes","text":" - llm: The
LLM to use to generate the embeddings. "},{"location":"components-gallery/tasks/generateembeddings/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[text]\n end\n subgraph New columns\n OCOL0[embedding]\n OCOL1[model_name]\n end\n end\n\n subgraph GenerateEmbeddings\n StepInput[Input Columns: text]\n StepOutput[Output Columns: embedding, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generateembeddings/#inputs","title":"Inputs","text":" - text (
str , List[Dict[str, str]] ): The input text or conversation to generate embeddings for. "},{"location":"components-gallery/tasks/generateembeddings/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/generateembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generateembeddings/#rank-llm-candidates","title":"Rank LLM candidates","text":"from distilabel.steps.tasks import GenerateEmbeddings\nfrom distilabel.models.llms.huggingface import TransformersLLM\n\n# Consider this as a placeholder for your actual LLM.\nembedder = GenerateEmbeddings(\n llm=TransformersLLM(\n model=\"TaylorAI/bge-micro-v2\",\n model_kwargs={\"is_decoder\": True},\n cuda_devices=[],\n )\n)\nembedder.load()\n\nresult = next(\n embedder.process(\n [\n {\"text\": \"Hello, how are you?\"},\n ]\n )\n)\n "},{"location":"components-gallery/tasks/generateembeddings/#references","title":"References","text":" - What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/textclustering/","title":"TextClustering","text":"Task that clusters a set of texts and generates summary labels for each cluster. This is a GlobalTask that inherits from TextClassification , this means that all the attributes from that class are available here. Also, in this case we deal with all the inputs at once, instead of using batches. The input_batch_size is used here to send the examples to the LLM in batches (a subtle difference with the more common Task definitions). The task looks in each cluster for a given number of representative examples (the number is set by the samples_per_cluster attribute), and sends them to the LLM to get a label/s that represent the cluster. The labels are then assigned to each text in the cluster. The clusters and projections used in the step, are assumed to be obtained from the UMAP + DBSCAN steps, but could be generated for similar steps, as long as they represent the same concepts. This step runs a pipeline like the one in this repository: https://github.com/huggingface/text-clustering "},{"location":"components-gallery/tasks/textclustering/#attributes","title":"Attributes","text":" - savefig: Whether to generate and save a figure with the clustering of the texts. - samples_per_cluster: The number of examples to use in the LLM as a sample of the cluster.
"},{"location":"components-gallery/tasks/textclustering/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[text]\n ICOL1[projection]\n ICOL2[cluster_label]\n end\n subgraph New columns\n OCOL0[summary_label]\n OCOL1[model_name]\n end\n end\n\n subgraph TextClustering\n StepInput[Input Columns: text, projection, cluster_label]\n StepOutput[Output Columns: summary_label, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/textclustering/#inputs","title":"Inputs","text":" -
text (str ): The reference text we want to obtain labels for. -
projection (List[float] ): Vector representation of the text to cluster, normally the output from the UMAP step. -
cluster_label (int ): Integer representing the label of a given cluster. -1 means it wasn't clustered. "},{"location":"components-gallery/tasks/textclustering/#outputs","title":"Outputs","text":""},{"location":"components-gallery/tasks/textclustering/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textclustering/#generate-labels-for-a-set-of-texts-using-clustering","title":"Generate labels for a set of texts using clustering","text":"from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import UMAP, DBSCAN, TextClustering\nfrom distilabel.pipeline import Pipeline\n\nds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\nwith Pipeline(name=\"Text clustering dataset\") as pipeline:\n batch_size = 500\n\n ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n umap = UMAP(n_components=2, metric=\"cosine\")\n dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n text_clustering = TextClustering(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n n=3, # 3 labels per example\n query_title=\"Examples of Personas\",\n samples_per_cluster=10,\n context=(\n \"Describe the main themes, topics, or categories that could describe the \"\n \"following types of personas. All the examples of personas must share \"\n \"the same set of labels.\"\n ),\n default_label=\"None\",\n savefig=True,\n input_batch_size=8,\n input_mappings={\"text\": \"persona\"},\n use_default_structured_output=True,\n )\n\n loader >> umap >> dbscan >> text_clustering\n "},{"location":"components-gallery/tasks/textclustering/#references","title":"References","text":" - text-clustering repository
"},{"location":"components-gallery/tasks/apigensemanticchecker/","title":"APIGenSemanticChecker","text":"Generate queries and answers for the given functions in JSON format. The APIGenGenerator is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task generates a set of diverse queries and corresponding answers for the given functions in JSON format. "},{"location":"components-gallery/tasks/apigensemanticchecker/#attributes","title":"Attributes","text":" -
system_prompt: System prompt for the task. Has a default one. -
exclude_failed_execution: Whether to exclude failed executions (won't run on those rows that have a False in keep_row_after_execution_check column, which comes from running APIGenExecutionChecker ). Defaults to True. "},{"location":"components-gallery/tasks/apigensemanticchecker/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[func_desc]\n ICOL1[query]\n ICOL2[answers]\n ICOL3[execution_result]\n end\n subgraph New columns\n OCOL0[thought]\n OCOL1[keep_row_after_semantic_check]\n end\n end\n\n subgraph APIGenSemanticChecker\n StepInput[Input Columns: func_desc, query, answers, execution_result]\n StepOutput[Output Columns: thought, keep_row_after_semantic_check]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n ICOL2 --> StepInput\n ICOL3 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/apigensemanticchecker/#inputs","title":"Inputs","text":" -
func_desc (str ): Description of what the function should do. -
query (str ): Instruction from the user. -
answers (str ): JSON encoded list with arguments to be passed to the function/API. Should be loaded using json.loads . -
execution_result (str ): Result of the function/API executed. "},{"location":"components-gallery/tasks/apigensemanticchecker/#outputs","title":"Outputs","text":" -
thought (str ): Reasoning for the output on whether to keep this output or not. -
keep_row_after_semantic_check (bool ): True or False, can be used to filter afterwards. "},{"location":"components-gallery/tasks/apigensemanticchecker/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/apigensemanticchecker/#semantic-checker-for-generated-function-calls-original-implementation","title":"Semantic checker for generated function calls (original implementation)","text":"from distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\nsemantic_checker = APIGenSemanticChecker(\n use_default_structured_output=False,\n llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'thought': '',\n# 'keep_row_after_semantic_check': True,\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n# 'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n# {'role': 'user',\n# 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. 
Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n \"thought\": \"Concisely describe your reasoning here\",\\n \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/apigensemanticchecker/#semantic-checker-for-generated-function-calls-structured-output","title":"Semantic checker for generated function calls (structured output)","text":"from distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n generation_kwargs={\n \"temperature\": 0.7,\n \"max_new_tokens\": 1024,\n },\n)\nsemantic_checker = APIGenSemanticChecker(\n use_default_structured_output=True,\n llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n semantic_checker.process(\n [\n {\n \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n }\n ]\n )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'keep_row_after_semantic_check': True,\n# 'thought': '',\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n# 'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. 
The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n# {'role': 'user',\n# 'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n "},{"location":"components-gallery/tasks/apigensemanticchecker/#references","title":"References","text":""},{"location":"components-gallery/tasks/generatetextretrievaldata/","title":"GenerateTextRetrievalData","text":"Generate text retrieval data with an LLM to later on train an embedding model. GenerateTextRetrievalData is a Task that generates text retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/generatetextretrievaldata/#note","title":"Note","text":"Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-retrieval\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-retrieval category. "},{"location":"components-gallery/tasks/generatetextretrievaldata/#attributes","title":"Attributes","text":" -
language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
query_type: The type of query to be generated, which can be extremely long-tail , long-tail , or common . Defaults to None , meaning that it will be randomly sampled. -
query_length: The length of the query to be generated, which can be less than 5 words , 5 to 15 words , or at least 10 words . Defaults to None , meaning that it will be randomly sampled. -
difficulty: The difficulty of the query to be generated, which can be high school , college , or PhD . Defaults to None , meaning that it will be randomly sampled. -
clarity: The clarity of the query to be generated, which can be clear , understandable with some effort , or ambiguous . Defaults to None , meaning that it will be randomly sampled. -
num_words: The number of words in the query to be generated, which can be 50 , 100 , 200 , 300 , 400 , or 500 . Defaults to None , meaning that it will be randomly sampled. -
seed: The random seed to be set in case there's any sampling within the format_input method. "},{"location":"components-gallery/tasks/generatetextretrievaldata/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[task]\n end\n subgraph New columns\n OCOL0[user_query]\n OCOL1[positive_document]\n OCOL2[hard_negative_document]\n OCOL3[model_name]\n end\n end\n\n subgraph GenerateTextRetrievalData\n StepInput[Input Columns: task]\n StepOutput[Output Columns: user_query, positive_document, hard_negative_document, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generatetextretrievaldata/#inputs","title":"Inputs","text":" - task (
str ): The task description to be used in the generation. "},{"location":"components-gallery/tasks/generatetextretrievaldata/#outputs","title":"Outputs","text":" -
user_query (str ): the user query generated by the LLM . -
positive_document (str ): the positive document generated by the LLM . -
hard_negative_document (str ): the hard negative document generated by the LLM . -
model_name (str ): the name of the model used to generate the text retrieval data. "},{"location":"components-gallery/tasks/generatetextretrievaldata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatetextretrievaldata/#generate-synthetic-text-retrieval-data-for-training-embedding-models","title":"Generate synthetic text retrieval data for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-retrieval\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextRetrievalData(\n language=\"English\",\n query_type=\"common\",\n query_length=\"5 to 15 words\",\n difficulty=\"high school\",\n clarity=\"clear\",\n num_words=100,\n llm=..., # LLM instance\n )\n\n task >> generate\n "},{"location":"components-gallery/tasks/generatetextretrievaldata/#references","title":"References","text":" - Improving Text Embeddings with Large Language Models
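 Beyond the pipeline example above, a minimal standalone sketch (the Inference Endpoints model id and generation settings here are only illustrative assumptions, not part of the original example): from distilabel.steps.tasks import GenerateTextRetrievalData\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate = GenerateTextRetrievalData(\n language=\"English\",\n difficulty=\"high school\",\n clarity=\"clear\",\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n ),\n)\ngenerate.load()\n\n# Each input row only needs a `task` column, as described in the Inputs section above.\nresult = next(generate.process([{\"task\": \"Retrieve relevant scientific documents for a user question\"}]))\n 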
"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/","title":"GenerateShortTextMatchingData","text":"Generate short text matching data with an LLM to later on train an embedding model. GenerateShortTextMatchingData is a Task that generates short text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#note","title":"Note","text":"Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-short\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-short category. "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#attributes","title":"Attributes","text":" -
language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
seed: The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params. "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[task]\n end\n subgraph New columns\n OCOL0[input]\n OCOL1[positive_document]\n OCOL2[model_name]\n end\n end\n\n subgraph GenerateShortTextMatchingData\n StepInput[Input Columns: task]\n StepOutput[Output Columns: input, positive_document, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#inputs","title":"Inputs","text":" - task (
str ): The task description to be used in the generation. "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#outputs","title":"Outputs","text":" -
input (str ): the input generated by the LLM . -
positive_document (str ): the positive document generated by the LLM . -
model_name (str ): the name of the model used to generate the short text matching data. "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#generate-synthetic-short-text-matching-data-for-training-embedding-models","title":"Generate synthetic short text matching data for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateShortTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-short\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateShortTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n "},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#references","title":"References","text":" - Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/","title":"GenerateLongTextMatchingData","text":"Generate long text matching data with an LLM to later on train an embedding model. GenerateLongTextMatchingData is a Task that generates long text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#note","title":"Note","text":"Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-long\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-long category. "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#attributes","title":"Attributes","text":" -
language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
seed: The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params. "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[task]\n end\n subgraph New columns\n OCOL0[input]\n OCOL1[positive_document]\n OCOL2[model_name]\n end\n end\n\n subgraph GenerateLongTextMatchingData\n StepInput[Input Columns: task]\n StepOutput[Output Columns: input, positive_document, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#inputs","title":"Inputs","text":" - task (
str ): The task description to be used in the generation. "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#outputs","title":"Outputs","text":" -
input (str ): the input generated by the LLM . -
positive_document (str ): the positive document generated by the LLM . -
model_name (str ): the name of the model used to generate the long text matching data. "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#generate-synthetic-long-text-matching-data-for-training-embedding-models","title":"Generate synthetic long text matching data for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateLongTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-matching-long\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateLongTextMatchingData(\n language=\"English\",\n llm=..., # LLM instance\n )\n\n task >> generate\n "},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#references","title":"References","text":" - Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/generatetextclassificationdata/","title":"GenerateTextClassificationData","text":"Generate text classification data with an LLM to later on train an embedding model. GenerateTextClassificationData is a Task that generates text classification data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/generatetextclassificationdata/#note","title":"Note","text":"Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-classification\" ; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-classification category. "},{"location":"components-gallery/tasks/generatetextclassificationdata/#attributes","title":"Attributes","text":" -
language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
difficulty: The difficulty of the query to be generated, which can be high school , college , or PhD . Defaults to None , meaning that it will be randomly sampled. -
clarity: The clarity of the query to be generated, which can be clear , understandable with some effort , or ambiguous . Defaults to None , meaning that it will be randomly sampled. -
seed: The random seed to be set in case there's any sampling within the format_input method. "},{"location":"components-gallery/tasks/generatetextclassificationdata/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[task]\n end\n subgraph New columns\n OCOL0[input_text]\n OCOL1[label]\n OCOL2[misleading_label]\n OCOL3[model_name]\n end\n end\n\n subgraph GenerateTextClassificationData\n StepInput[Input Columns: task]\n StepOutput[Output Columns: input_text, label, misleading_label, model_name]\n end\n\n ICOL0 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/generatetextclassificationdata/#inputs","title":"Inputs","text":" - task (
str ): The task description to be used in the generation. "},{"location":"components-gallery/tasks/generatetextclassificationdata/#outputs","title":"Outputs","text":" -
input_text (str ): the input text generated by the LLM . -
label (str ): the label generated by the LLM . -
misleading_label (str ): the misleading label generated by the LLM . -
model_name (str ): the name of the model used to generate the text classification data. "},{"location":"components-gallery/tasks/generatetextclassificationdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatetextclassificationdata/#generate-synthetic-text-classification-data-for-training-embedding-models","title":"Generate synthetic text classification data for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-classification\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n generate = GenerateTextClassificationData(\n language=\"English\",\n difficulty=\"high school\",\n clarity=\"clear\",\n llm=..., # LLM instance\n )\n\n task >> generate\n "},{"location":"components-gallery/tasks/generatetextclassificationdata/#references","title":"References","text":" - Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/structuredgeneration/","title":"StructuredGeneration","text":"Generate structured content for a given instruction using an LLM . StructuredGeneration is a pre-defined task that defines the instruction and the structured_output as the inputs, and generation as the output. This task is used to generate structured content based on the input instruction and following the schema provided within the structured_output column per each instruction . The model_name also returned as part of the output in order to enhance it. "},{"location":"components-gallery/tasks/structuredgeneration/#attributes","title":"Attributes","text":" - use_system_prompt: Whether to use the system prompt in the generation. Defaults to
True , which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored. "},{"location":"components-gallery/tasks/structuredgeneration/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph Columns\n ICOL0[instruction]\n ICOL1[structured_output]\n end\n subgraph New columns\n OCOL0[generation]\n OCOL1[model_name]\n end\n end\n\n subgraph StructuredGeneration\n StepInput[Input Columns: instruction, structured_output]\n StepOutput[Output Columns: generation, model_name]\n end\n\n ICOL0 --> StepInput\n ICOL1 --> StepInput\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepInput --> StepOutput\n "},{"location":"components-gallery/tasks/structuredgeneration/#inputs","title":"Inputs","text":" -
instruction (str ): The instruction to generate structured content from. -
structured_output (Dict[str, Any] ): The structured_output to generate structured content from. It should be a Python dictionary with the keys format and schema , where format should be one of json or regex , and the schema should be either the JSON schema or the regex pattern, respectively. "},{"location":"components-gallery/tasks/structuredgeneration/#outputs","title":"Outputs","text":" -
generation (str ): The generated text matching the provided schema, if possible. -
model_name (str ): The name of the model used to generate the text. "},{"location":"components-gallery/tasks/structuredgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/structuredgeneration/#generate-structured-output-from-a-json-schema","title":"Generate structured output from a JSON schema","text":"from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n)\n\nstructured_gen.load()\n\nresult = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"Create an RPG character\",\n \"structured_output\": {\n \"format\": \"json\",\n \"schema\": {\n \"properties\": {\n \"name\": {\n \"title\": \"Name\",\n \"type\": \"string\"\n },\n \"description\": {\n \"title\": \"Description\",\n \"type\": \"string\"\n },\n \"role\": {\n \"title\": \"Role\",\n \"type\": \"string\"\n },\n \"weapon\": {\n \"title\": \"Weapon\",\n \"type\": \"string\"\n }\n },\n \"required\": [\n \"name\",\n \"description\",\n \"role\",\n \"weapon\"\n ],\n \"title\": \"Character\",\n \"type\": \"object\"\n }\n },\n }\n ]\n )\n)\n "},{"location":"components-gallery/tasks/structuredgeneration/#generate-structured-output-from-a-regex-pattern-only-works-with-llms-that-support-regex-the-providers-using-outlines","title":"Generate structured output from a regex pattern (only works with LLMs that support regex, the providers using outlines)","text":"from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n)\n\nstructured_gen.load()\n\nresult = next(\n structured_gen.process(\n [\n {\n \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n \"structured_output\": {\n \"format\": \"regex\",\n \"schema\": r\"(\\d{1,2})\u00b0C\"\n },\n\n }\n ]\n )\n)\n "},{"location":"components-gallery/tasks/monolingualtripletgenerator/","title":"MonolingualTripletGenerator","text":"Generate monolingual triplets with an LLM to later on train an embedding model. MonolingualTripletGenerator is a GeneratorTask that generates monolingual triplets with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/monolingualtripletgenerator/#attributes","title":"Attributes","text":" -
language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
unit: The unit of the data to be generated, which can be sentence , phrase , or passage . Defaults to None , meaning that it will be randomly sampled. -
difficulty: The difficulty of the query to be generated, which can be elementary school , high school , or college . Defaults to None , meaning that it will be randomly sampled. -
high_score: The high score of the query to be generated, which can be 4 , 4.5 , or 5 . Defaults to None , meaning that it will be randomly sampled. -
low_score: The low score of the query to be generated, which can be 2.5 , 3 , or 3.5 . Defaults to None , meaning that it will be randomly sampled. -
seed: The random seed to be set in case there's any sampling within the format_input method. "},{"location":"components-gallery/tasks/monolingualtripletgenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[S1]\n OCOL1[S2]\n OCOL2[S3]\n OCOL3[model_name]\n end\n end\n\n subgraph MonolingualTripletGenerator\n StepOutput[Output Columns: S1, S2, S3, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n "},{"location":"components-gallery/tasks/monolingualtripletgenerator/#outputs","title":"Outputs","text":" -
S1 (str ): the first sentence generated by the LLM . -
S2 (str ): the second sentence generated by the LLM . -
S3 (str ): the third sentence generated by the LLM . -
model_name (str ): the name of the model used to generate the monolingual triplets. "},{"location":"components-gallery/tasks/monolingualtripletgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/monolingualtripletgenerator/#generate-monolingual-triplets-for-training-embedding-models","title":"Generate monolingual triplets for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import MonolingualTripletGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = MonolingualTripletGenerator(\n language=\"English\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n "},{"location":"components-gallery/tasks/bitextretrievalgenerator/","title":"BitextRetrievalGenerator","text":"Generate bitext retrieval data with an LLM to later on train an embedding model. BitextRetrievalGenerator is a GeneratorTask that generates bitext retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided. "},{"location":"components-gallery/tasks/bitextretrievalgenerator/#attributes","title":"Attributes","text":" -
source_language: The source language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
target_language: The target language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf. -
unit: The unit of the data to be generated, which can be sentence , phrase , or passage . Defaults to None , meaning that it will be randomly sampled. -
difficulty: The difficulty of the query to be generated, which can be elementary school , high school , or college . Defaults to None , meaning that it will be randomly sampled. -
high_score: The high score of the query to be generated, which can be 4 , 4.5 , or 5 . Defaults to None , meaning that it will be randomly sampled. -
low_score: The low score of the query to be generated, which can be 2.5 , 3 , or 3.5 . Defaults to None , meaning that it will be randomly sampled. -
seed: The random seed to be set in case there's any sampling within the format_input method. "},{"location":"components-gallery/tasks/bitextretrievalgenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[S1]\n OCOL1[S2]\n OCOL2[S3]\n OCOL3[model_name]\n end\n end\n\n subgraph BitextRetrievalGenerator\n StepOutput[Output Columns: S1, S2, S3, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n StepOutput --> OCOL3\n "},{"location":"components-gallery/tasks/bitextretrievalgenerator/#outputs","title":"Outputs","text":" -
S1 (str ): the first sentence generated by the LLM . -
S2 (str ): the second sentence generated by the LLM . -
S3 (str ): the third sentence generated by the LLM . -
model_name (str ): the name of the model used to generate the bitext retrieval data. "},{"location":"components-gallery/tasks/bitextretrievalgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/bitextretrievalgenerator/#generate-bitext-retrieval-data-for-training-embedding-models","title":"Generate bitext retrieval data for training embedding models","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import BitextRetrievalGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = BitextRetrievalGenerator(\n source_language=\"English\",\n target_language=\"Spanish\",\n unit=\"sentence\",\n difficulty=\"elementary school\",\n high_score=\"4\",\n low_score=\"2.5\",\n llm=...,\n )\n\n ...\n\n task >> ...\n "},{"location":"components-gallery/tasks/embeddingtaskgenerator/","title":"EmbeddingTaskGenerator","text":"Generate task descriptions for embedding-related tasks using an LLM . EmbeddingTaskGenerator is a GeneratorTask that doesn't receive any input besides the provided attributes, and generates task descriptions for embedding-related tasks using a pre-defined prompt based on the category attribute. The category attribute should be one of the following: - `text-retrieval`: Generate task descriptions for text retrieval tasks.\n- `text-matching-short`: Generate task descriptions for short text matching tasks.\n- `text-matching-long`: Generate task descriptions for long text matching tasks.\n- `text-classification`: Generate task descriptions for text classification tasks.\n "},{"location":"components-gallery/tasks/embeddingtaskgenerator/#attributes","title":"Attributes","text":" -
category: The category of the task to be generated, which can either be text-retrieval , text-matching-short , text-matching-long , or text-classification . -
flatten_tasks: Whether to flatten the tasks i.e. since a list of tasks is generated by the LLM , this attribute indicates whether to flatten the list or not. Defaults to False , meaning that running this task with num_generations=1 will return a distilabel.Distiset with one row only containing a list with around 20 tasks; otherwise, if set to True , it will return a distilabel.Distiset with around 20 rows, each containing one task. "},{"location":"components-gallery/tasks/embeddingtaskgenerator/#input-output-columns","title":"Input & Output Columns","text":"graph TD\n subgraph Dataset\n subgraph New columns\n OCOL0[tasks]\n OCOL1[task]\n OCOL2[model_name]\n end\n end\n\n subgraph EmbeddingTaskGenerator\n StepOutput[Output Columns: tasks, task, model_name]\n end\n\n StepOutput --> OCOL0\n StepOutput --> OCOL1\n StepOutput --> OCOL2\n "},{"location":"components-gallery/tasks/embeddingtaskgenerator/#outputs","title":"Outputs","text":" -
tasks (List[str] ): the list of tasks generated by the LLM . -
task (str ): the task generated by the LLM if flatten_tasks=True . -
model_name (str ): the name of the model used to generate the tasks. "},{"location":"components-gallery/tasks/embeddingtaskgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/embeddingtaskgenerator/#generate-embedding-tasks-for-text-retrieval","title":"Generate embedding tasks for text retrieval","text":"from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n task = EmbeddingTaskGenerator(\n category=\"text-retrieval\",\n flatten_tasks=True,\n llm=..., # LLM instance\n )\n\n ...\n\n task >> ...\n "},{"location":"components-gallery/tasks/embeddingtaskgenerator/#references","title":"References","text":" - Improving Text Embeddings with Large Language Models
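 As a small sketch of the flatten_tasks behaviour described in the Attributes section (the llm=... placeholder follows the convention of the other examples in this gallery): from distilabel.steps.tasks import EmbeddingTaskGenerator\n\n# flatten_tasks=False (the default): each generation yields a single row whose `tasks`\n# column holds the whole list of roughly 20 task descriptions.\nnested = EmbeddingTaskGenerator(category=\"text-retrieval\", flatten_tasks=False, llm=...) # LLM instance\n\n# flatten_tasks=True: the list is flattened, producing roughly 20 rows with a single `task` column each.\nflat = EmbeddingTaskGenerator(category=\"text-retrieval\", flatten_tasks=True, llm=...) # LLM instance\n 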
"},{"location":"components-gallery/llms/","title":"LLMs Gallery","text":" -
AnthropicLLM Anthropic LLM implementation running the Async API client. AnthropicLLM -
OpenAILLM OpenAI LLM implementation running the async API client. OpenAILLM -
AnyscaleLLM Anyscale LLM implementation running the async API client of OpenAI. AnyscaleLLM -
AzureOpenAILLM Azure OpenAI LLM implementation running the async API client. AzureOpenAILLM -
TogetherLLM TogetherLLM LLM implementation running the async API client of OpenAI. TogetherLLM -
ClientvLLM A client for the vLLM server implementing the OpenAI API specification. ClientvLLM -
CohereLLM Cohere API implementation using the async client for concurrent text generation. CohereLLM -
GroqLLM Groq API implementation using the async client for concurrent text generation. GroqLLM -
InferenceEndpointsLLM InferenceEndpoints LLM implementation running the async API client. InferenceEndpointsLLM -
LiteLLM LiteLLM implementation running the async API client. LiteLLM -
MistralLLM Mistral LLM implementation running the async API client. MistralLLM -
MixtureOfAgentsLLM Mixture-of-Agents implementation. MixtureOfAgentsLLM -
OllamaLLM Ollama LLM implementation running the Async API client. OllamaLLM -
VertexAILLM VertexAI LLM implementation running the async API clients for Gemini. VertexAILLM -
TransformersLLM Hugging Face transformers library LLM implementation using the text generation pipeline. TransformersLLM -
LlamaCppLLM llama.cpp LLM implementation running the Python bindings for the C++ code. LlamaCppLLM -
vLLM vLLM library LLM implementation. vLLM "},{"location":"components-gallery/llms/anthropicllm/","title":"AnthropicLLM","text":"Anthropic LLM implementation running the Async API client. "},{"location":"components-gallery/llms/anthropicllm/#attributes","title":"Attributes","text":" -
model: the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\", \"claude-3-sonnet-20240229\", etc. Available models can be checked here: Anthropic: Models overview. -
api_key: the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable. -
base_url: the base URL to use for the Anthropic API. Defaults to None which means that https://api.anthropic.com will be used internally. -
timeout: the maximum time in seconds to wait for a response. Defaults to 600.0 . -
max_retries: The maximum number of times to retry the request before failing. Defaults to 6 . -
http_client: if provided, an alternative HTTP client to use for calling Anthropic API. Defaults to None . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . -
_api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally. -
_aclient: the AsyncAnthropic client to use for the Anthropic API. It is meant to be used internally. Set in the load method. "},{"location":"components-gallery/llms/anthropicllm/#runtime-parameters","title":"Runtime Parameters","text":" -
api_key: the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable. -
base_url: the base URL to use for the Anthropic API. Defaults to \"https://api.anthropic.com\" . -
timeout: the maximum time in seconds to wait for a response. Defaults to 600.0 . -
max_retries: the maximum number of times to retry the request before failing. Defaults to 6 . "},{"location":"components-gallery/llms/anthropicllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/anthropicllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import AnthropicLLM\n\nllm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/anthropicllm/#generate-structured-data","title":"Generate structured data","text":"from pydantic import BaseModel\nfrom distilabel.models.llms import AnthropicLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = AnthropicLLM(\n model=\"claude-3-opus-20240229\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n "},{"location":"components-gallery/llms/openaillm/","title":"OpenAILLM","text":"OpenAI LLM implementation running the async API client. "},{"location":"components-gallery/llms/openaillm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc. Supported models can be found here. -
base_url: the base URL to use for the OpenAI API requests. Defaults to None , which means that the value set for the environment variable OPENAI_BASE_URL will be used, or \"https://api.openai.com/v1\" if not set. -
api_key: the API key to authenticate the requests to the OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_KEY will be used, or None if not set. -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . "},{"location":"components-gallery/llms/openaillm/#runtime-parameters","title":"Runtime Parameters","text":" -
base_url: the base URL to use for the OpenAI API requests. Defaults to None . -
api_key: the API key to authenticate the requests to the OpenAI API. Defaults to None . -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . "},{"location":"components-gallery/llms/openaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/openaillm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/openaillm/#generate-text-from-a-custom-endpoint-following-the-openai-api","title":"Generate text from a custom endpoint following the OpenAI API","text":"from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/openaillm/#generate-structured-data","title":"Generate structured data","text":"from pydantic import BaseModel\nfrom distilabel.models.llms import OpenAILLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = OpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n "},{"location":"components-gallery/llms/openaillm/#generate-with-batch-api-offline-batch-generation","title":"Generate with Batch API (offline batch generation)","text":"from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n model=\"gpt-3.5-turbo\",\n use_offline_batch_generation=True,\n offline_batch_generation_block_until_done=5, # poll for results every 5 seconds\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n# [['Hello! How can I assist you today?']]\n "},{"location":"components-gallery/llms/anyscalellm/","title":"AnyscaleLLM","text":"Anyscale LLM implementation running the async API client of OpenAI. "},{"location":"components-gallery/llms/anyscalellm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM, e.g., google/gemma-7b-it . See the supported models under the \"Text Generation -> Supported Models\" section here. -
base_url: the base URL to use for the Anyscale API requests. Defaults to None , which means that the value set for the environment variable ANYSCALE_BASE_URL will be used, or \"https://api.endpoints.anyscale.com/v1\" if not set. -
api_key: the API key to authenticate the requests to the Anyscale API. Defaults to None which means that the value set for the environment variable ANYSCALE_API_KEY will be used, or None if not set. -
_api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally. "},{"location":"components-gallery/llms/anyscalellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/anyscalellm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import AnyscaleLLM\n\nllm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/azureopenaillm/","title":"AzureOpenAILLM","text":"Azure OpenAI LLM implementation running the async API client. "},{"location":"components-gallery/llms/azureopenaillm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM i.e. the name of the Azure deployment. -
base_url: the base URL to use for the Azure OpenAI API can be set with AZURE_OPENAI_ENDPOINT . Defaults to None which means that the value set for the environment variable AZURE_OPENAI_ENDPOINT will be used, or None if not set. -
api_key: the API key to authenticate the requests to the Azure OpenAI API. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_API_KEY will be used, or None if not set. -
api_version: the API version to use for the Azure OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_VERSION will be used, or None if not set. "},{"location":"components-gallery/llms/azureopenaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/azureopenaillm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/azureopenaillm/#generate-text-from-a-custom-endpoint-following-the-openai-api","title":"Generate text from a custom endpoint following the OpenAI API","text":"from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/azureopenaillm/#generate-structured-data","title":"Generate structured data","text":"from pydantic import BaseModel\nfrom distilabel.models.llms import AzureOpenAILLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = AzureOpenAILLM(\n model=\"gpt-4-turbo\",\n api_key=\"api.key\",\n structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n "},{"location":"components-gallery/llms/togetherllm/","title":"TogetherLLM","text":"TogetherLLM LLM implementation running the async API client of OpenAI. "},{"location":"components-gallery/llms/togetherllm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\". Supported models can be found here. -
base_url: the base URL to use for the Together API can be set with TOGETHER_BASE_URL . Defaults to None which means that the value set for the environment variable TOGETHER_BASE_URL will be used, or \"https://api.together.xyz/v1\" if not set. -
api_key: the API key to authenticate the requests to the Together API. Defaults to None which means that the value set for the environment variable TOGETHER_API_KEY will be used, or None if not set. -
_api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally. "},{"location":"components-gallery/llms/togetherllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/togetherllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import TogetherLLM\n\nllm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/clientvllm/","title":"ClientvLLM","text":"A client for the vLLM server implementing the OpenAI API specification. "},{"location":"components-gallery/llms/clientvllm/#attributes","title":"Attributes","text":" -
base_url: the base URL of the vLLM server. Defaults to \"http://localhost:8000\" . -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
httpx_client_kwargs: extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None . -
tokenizer: the Hugging Face Hub repo id or path of the tokenizer that will be used to apply the chat template and tokenize the inputs before sending them to the server. Defaults to None . -
tokenizer_revision: the revision of the tokenizer to load. Defaults to None . -
_aclient: the httpx.AsyncClient used to communicate with the vLLM server. Defaults to None . "},{"location":"components-gallery/llms/clientvllm/#runtime-parameters","title":"Runtime Parameters","text":" -
base_url: the base URL of the vLLM server. Defaults to \"http://localhost:8000\" . -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
httpx_client_kwargs: extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None . "},{"location":"components-gallery/llms/clientvllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/clientvllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import ClientvLLM\n\nllm = ClientvLLM(\n base_url=\"http://localhost:8000/v1\",\n tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n)\n\nllm.load()\n\nresults = llm.generate_outputs(\n inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n temperature=0.7,\n top_p=1.0,\n max_new_tokens=256,\n)\n# [\n# [\n# \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n# \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n# \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. What's on your mind?\"\n# ]\n# ]\n "},{"location":"components-gallery/llms/coherellm/","title":"CohereLLM","text":"Cohere API implementation using the async client for concurrent text generation. "},{"location":"components-gallery/llms/coherellm/#attributes","title":"Attributes","text":" -
model: the name of the model from the Cohere API to use for the generation. -
base_url: the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\" . -
api_key: the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable. -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
client_name: the name of the client to use for the API requests. Defaults to \"distilabel\" . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . -
_ChatMessage: the ChatMessage class from the cohere package. -
_aclient: the AsyncClient client from the cohere package. "},{"location":"components-gallery/llms/coherellm/#runtime-parameters","title":"Runtime Parameters","text":" -
base_url: the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\" . -
api_key: the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable. -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
client_name: the name of the client to use for the API requests. Defaults to \"distilabel\" . "},{"location":"components-gallery/llms/coherellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/coherellm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import CohereLLM\n\nllm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n "},{"location":"components-gallery/llms/groqllm/","title":"GroqLLM","text":"Groq API implementation using the async client for concurrent text generation. "},{"location":"components-gallery/llms/groqllm/#attributes","title":"Attributes","text":" -
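Note: the CohereLLM "Generate text" example above ends with a "Generate structured data:" note but no snippet. Below is a minimal sketch, assuming CohereLLM accepts the same instructor-style structured_output={"schema": ...} dictionary documented in its structured_output attribute; the User model is illustrative, not taken from the docs.

from pydantic import BaseModel
from distilabel.models.llms import CohereLLM

class User(BaseModel):
    name: str
    last_name: str
    id: int

# Assumption: a pydantic model can be passed as the instructor schema, mirroring the
# structured-output examples of the other API clients in this gallery.
llm = CohereLLM(
    model="CohereForAI/c4ai-command-r-plus",
    structured_output={"schema": User},
)

llm.load()

output = llm.generate_outputs(inputs=[[{"role": "user", "content": "Create a user profile for the following marathon"}]])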
model: the name of the model from the Groq API to use for the generation. -
base_url: the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\" . -
api_key: the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable. -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 2 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . -
_api_key_env_var: the name of the environment variable to use for the API key. -
_aclient: the AsyncGroq client from the groq package. "},{"location":"components-gallery/llms/groqllm/#runtime-parameters","title":"Runtime Parameters","text":" -
base_url: the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\" . -
api_key: the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable. -
max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 2 . -
timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120 . "},{"location":"components-gallery/llms/groqllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/groqllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import GroqLLM\n\nllm = GroqLLM(model=\"llama3-70b-8192\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n "},{"location":"components-gallery/llms/inferenceendpointsllm/","title":"InferenceEndpointsLLM","text":"InferenceEndpoints LLM implementation running the async API client. This LLM will internally use huggingface_hub.AsyncInferenceClient . "},{"location":"components-gallery/llms/inferenceendpointsllm/#attributes","title":"Attributes","text":" -
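Note: the GroqLLM "Generate text" example above also ends with a "Generate structured data:" note without a snippet. A minimal sketch, assuming the instructor-based structured_output={"schema": ...} configuration described in the GroqLLM structured_output attribute (the User model is illustrative):

from pydantic import BaseModel
from distilabel.models.llms import GroqLLM

class User(BaseModel):
    name: str
    last_name: str
    id: int

# Assumption: the instructor schema is passed as a pydantic model, as in the
# AnthropicLLM/OpenAILLM structured-output examples above.
llm = GroqLLM(
    model="llama3-70b-8192",
    structured_output={"schema": User},
)

llm.load()

output = llm.generate_outputs(inputs=[[{"role": "user", "content": "Create a user profile for the following marathon"}]])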
model_id: the model ID to use for the LLM as available in the Hugging Face Hub, which will be used to resolve the base URL for the serverless Inference Endpoints API requests. Defaults to None . -
endpoint_name: the name of the Inference Endpoint to use for the LLM. Defaults to None . -
endpoint_namespace: the namespace of the Inference Endpoint to use for the LLM. Defaults to None . -
base_url: the base URL to use for the Inference Endpoints API requests. -
api_key: the API key to authenticate the requests to the Inference Endpoints API. -
tokenizer_id: the tokenizer ID to use for the LLM as available in the Hugging Face Hub. Defaults to None , but defining one is recommended to properly format the prompt. -
model_display_name: the model display name to use for the LLM. Defaults to None . -
use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False . -
magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . -
structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. "},{"location":"components-gallery/llms/inferenceendpointsllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/inferenceendpointsllm/#free-serverless-inference-api-set-the-input_batch_size-of-the-task-that-uses-this-to-avoid-model-is-overloaded","title":"Free serverless Inference API, set the input_batch_size of the Task that uses this to avoid Model is overloaded","text":"from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/inferenceendpointsllm/#dedicated-inference-endpoints","title":"Dedicated Inference Endpoints","text":"from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n endpoint_name=\"<ENDPOINT_NAME>\",\n api_key=\"<HF_API_KEY>\",\n endpoint_namespace=\"<USER|ORG>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/inferenceendpointsllm/#dedicated-inference-endpoints-or-tgi","title":"Dedicated Inference Endpoints or TGI","text":"from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n api_key=\"<HF_API_KEY>\",\n base_url=\"<BASE_URL>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/inferenceendpointsllm/#generate-structured-data","title":"Generate structured data","text":"from pydantic import BaseModel\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n api_key=\"api.key\",\n structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n "},{"location":"components-gallery/llms/litellm/","title":"LiteLLM","text":"LiteLLM implementation running the async API client. "},{"location":"components-gallery/llms/litellm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\", etc. -
verbose: whether to log the LiteLLM client's logs. Defaults to False . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . "},{"location":"components-gallery/llms/litellm/#runtime-parameters","title":"Runtime Parameters","text":" - verbose: whether to log the LiteLLM client's logs. Defaults to
False . "},{"location":"components-gallery/llms/litellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/litellm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import LiteLLM\n\nllm = LiteLLM(model=\"gpt-3.5-turbo\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n "},{"location":"components-gallery/llms/mistralllm/","title":"MistralLLM","text":"Mistral LLM implementation running the async API client. "},{"location":"components-gallery/llms/mistralllm/#attributes","title":"Attributes","text":" -
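Note: the LiteLLM "Generate text" example above ends with a "Generate structured data:" note without a snippet. A minimal sketch, assuming LiteLLM follows the same instructor-style structured_output={"schema": ...} pattern documented in its structured_output attribute; the User model is illustrative:

from pydantic import BaseModel
from distilabel.models.llms import LiteLLM

class User(BaseModel):
    name: str
    last_name: str
    id: int

# Assumption: a pydantic model is accepted as the instructor schema, as shown for the
# other API clients in this gallery; llm.generate matches the LiteLLM example above.
llm = LiteLLM(
    model="gpt-3.5-turbo",
    structured_output={"schema": User},
)

llm.load()

output = llm.generate(inputs=[[{"role": "user", "content": "Create a user profile for the following marathon"}]])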
model: the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc. -
endpoint: the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\". -
api_key: the API key to authenticate the requests to the Mistral API. Defaults to None which means that the value set for the environment variable MISTRAL_API_KEY will be used, or None if not set. -
max_retries: the maximum number of retries to attempt when a request fails. Defaults to 5 . -
timeout: the maximum time in seconds to wait for a response. Defaults to 120 . -
max_concurrent_requests: the maximum number of concurrent requests to send. Defaults to 64 . -
structured_output: a dictionary containing the structured output configuration using instructor . You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor . -
_api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally. -
_aclient: the Mistral client to use for the Mistral API. It is meant to be used internally. Set in the load method. "},{"location":"components-gallery/llms/mistralllm/#runtime-parameters","title":"Runtime Parameters","text":" -
api_key: the API key to authenticate the requests to the Mistral API. -
max_retries: the maximum number of retries to attempt when a request fails. Defaults to 5 . -
timeout: the maximum time in seconds to wait for a response. Defaults to 120 . -
max_concurrent_requests: the maximum number of concurrent requests to send. Defaults to 64 . "},{"location":"components-gallery/llms/mistralllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/mistralllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import MistralLLM\n\nllm = MistralLLM(model=\"open-mixtral-8x22b\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data (see the sketch below):\n "},{"location":"components-gallery/llms/mixtureofagentsllm/","title":"MixtureOfAgentsLLM","text":"Mixture-of-Agents implementation. An LLM class that leverages LLM s' collective strengths to generate a response, as described in the \"Mixture-of-Agents Enhances Large Language Model Capabilities\" paper. There is a list of LLM s proposing/generating outputs that LLM s from the next round/layer can use as auxiliary information. Finally, there is an LLM that aggregates the outputs to generate the final response. "},{"location":"components-gallery/llms/mixtureofagentsllm/#attributes","title":"Attributes","text":" -
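The MistralLLM structured-data sketch referenced above: it assumes MistralLLM accepts the instructor-style structured_output={"schema": ...} dictionary documented in its structured_output attribute; the User model is illustrative.

from pydantic import BaseModel
from distilabel.models.llms import MistralLLM

class User(BaseModel):
    name: str
    last_name: str
    id: int

# Assumption: a pydantic model is accepted as the instructor schema, mirroring the
# structured-output examples of the other API clients; llm.generate matches the example above.
llm = MistralLLM(
    model="open-mixtral-8x22b",
    structured_output={"schema": User},
)

llm.load()

output = llm.generate(inputs=[[{"role": "user", "content": "Create a user profile for the following marathon"}]])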
aggregator_llm: The LLM that aggregates the outputs of the proposer LLM s. -
proposers_llms: The list of LLM s that propose outputs to be aggregated. -
rounds: The number of layers or rounds that the proposers_llms will generate outputs. Defaults to 1 . "},{"location":"components-gallery/llms/mixtureofagentsllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/mixtureofagentsllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\nllm = MixtureOfAgentsLLM(\n aggregator_llm=InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n proposers_llms=[\n InferenceEndpointsLLM(\n model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n ),\n InferenceEndpointsLLM(\n model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n ),\n InferenceEndpointsLLM(\n model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n ),\n ],\n rounds=2,\n)\n\nllm.load()\n\noutput = llm.generate_outputs(\n inputs=[\n [\n {\n \"role\": \"user\",\n \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n }\n ]\n ]\n)\n "},{"location":"components-gallery/llms/mixtureofagentsllm/#references","title":"References","text":" - Mixture-of-Agents Enhances Large Language Model Capabilities
"},{"location":"components-gallery/llms/ollamallm/","title":"OllamaLLM","text":"Ollama LLM implementation running the Async API client. "},{"location":"components-gallery/llms/ollamallm/#attributes","title":"Attributes","text":" -
model: the model name to use for the LLM e.g. \"notus\". -
host: the Ollama server host. -
timeout: the timeout for the LLM. Defaults to 120 . -
_aclient: the AsyncClient to use for the Ollama API. It is meant to be used internally. Set in the load method. "},{"location":"components-gallery/llms/ollamallm/#runtime-parameters","title":"Runtime Parameters","text":""},{"location":"components-gallery/llms/ollamallm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/ollamallm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import OllamaLLM\n\nllm = OllamaLLM(model=\"llama3\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/vertexaillm/","title":"VertexAILLM","text":"VertexAI LLM implementation running the async API clients for Gemini. -
Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini To use the VertexAILLM it is necessary to have configured the Google Cloud authentication using one of these methods: - Setting
GOOGLE_CLOUD_CREDENTIALS environment variable - Using
gcloud auth application-default login command - Using
vertexai.init function from the google-cloud-aiplatform library "},{"location":"components-gallery/llms/vertexaillm/#attributes","title":"Attributes","text":" -
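As an illustration of the third authentication option above, a minimal sketch; the project and location values are placeholders, not taken from the docs.

import vertexai
from distilabel.models.llms import VertexAILLM

# Placeholder Google Cloud project/location; replace with your own settings.
vertexai.init(project="my-gcp-project", location="us-central1")

llm = VertexAILLM(model="gemini-1.5-pro")
llm.load()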
model: the model name to use for the LLM e.g. \"gemini-1.0-pro\". Supported models. -
_aclient: the GenerativeModel to use for the Vertex AI Gemini API. It is meant to be used internally. Set in the load method. "},{"location":"components-gallery/llms/vertexaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/vertexaillm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import VertexAILLM\n\nllm = VertexAILLM(model=\"gemini-1.5-pro\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/transformersllm/","title":"TransformersLLM","text":"Hugging Face transformers library LLM implementation using the text generation pipeline. "},{"location":"components-gallery/llms/transformersllm/#attributes","title":"Attributes","text":" -
model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. -
revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . -
torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\" . -
trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . -
model_kwargs: additional dictionary of keyword arguments that will be passed to the from_pretrained method of the model. -
tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer config files. If not provided, the one associated to the model will be used. Defaults to None . -
use_fast: whether to use a fast tokenizer or not. Defaults to True . -
chat_template: a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None . -
device: the name or index of the device where the model will be loaded. Defaults to None . -
device_map: a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\" . Defaults to None . -
token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None . -
structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. -
use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False . -
magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . "},{"location":"components-gallery/llms/transformersllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/transformersllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import TransformersLLM\n\nllm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/llamacppllm/","title":"LlamaCppLLM","text":"llama.cpp LLM implementation running the Python bindings for the C++ code. "},{"location":"components-gallery/llms/llamacppllm/#attributes","title":"Attributes","text":" -
model_path: contains the path to the GGUF quantized model, compatible with the installed version of the llama.cpp Python bindings. -
n_gpu_layers: the number of layers to use for the GPU. Defaults to -1 , meaning that the available GPU device will be used. -
chat_format: the chat format to use for the model. Defaults to None , which means the Llama format will be used. -
n_ctx: the context size to use for the model. Defaults to 512 . -
n_batch: the prompt processing maximum batch size to use for the model. Defaults to 512 . -
seed: random seed to use for the generation. Defaults to 4294967295 . -
verbose: whether to print verbose output. Defaults to False . -
structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. -
extra_kwargs: additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {} . -
_model: the Llama model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. "},{"location":"components-gallery/llms/llamacppllm/#runtime-parameters","title":"Runtime Parameters","text":" -
model_path: the path to the GGUF quantized model. -
n_gpu_layers: the number of layers to use for the GPU. Defaults to -1 . -
chat_format: the chat format to use for the model. Defaults to None . -
verbose: whether to print verbose output. Defaults to False . -
extra_kwargs: additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {} . "},{"location":"components-gallery/llms/llamacppllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/llamacppllm/#generate-text","title":"Generate text","text":"from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\n\n# You can follow along with this example by downloading the model with the following\n# command in the terminal, which will download it to the `Downloads` folder:\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nllm = LlamaCppLLM(\n model_path=str(Path.home() / model_path),\n n_gpu_layers=-1, # To use the GPU if available\n n_ctx=1024, # Set the context size\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/llamacppllm/#generate-structured-data","title":"Generate structured data","text":"from pathlib import Path\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import LlamaCppLLM\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = LlamaCppLLM(\n model_path=str(Path.home() / model_path), # type: ignore\n n_gpu_layers=-1,\n n_ctx=1024,\n structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n "},{"location":"components-gallery/llms/llamacppllm/#references","title":"References","text":" -
llama.cpp -
llama-cpp-python "},{"location":"components-gallery/llms/vllm/","title":"vLLM","text":"vLLM library LLM implementation. "},{"location":"components-gallery/llms/vllm/#attributes","title":"Attributes","text":" -
model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. -
dtype: the data type to use for the model. Defaults to auto . -
trust_remote_code: whether to trust the remote code when loading the model. Defaults to False . -
quantization: the quantization mode to use for the model. Defaults to None . -
revision: the revision of the model to load. Defaults to None . -
tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer files. If not provided, the tokenizer will be loaded from the model directory. Defaults to None . -
tokenizer_mode: the mode to use for the tokenizer. Defaults to auto . -
tokenizer_revision: the revision of the tokenizer to load. Defaults to None . -
skip_tokenizer_init: whether to skip the initialization of the tokenizer. Defaults to False . -
chat_template: a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None . -
structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput . Defaults to None. -
seed: the seed to use for the random number generator. Defaults to 0 . -
extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {} . -
_model: the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. -
_tokenizer: the tokenizer instance used to format the prompt before passing it to the LLM . This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. -
use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False . -
magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None . "},{"location":"components-gallery/llms/vllm/#runtime-parameters","title":"Runtime Parameters","text":" - extra_kwargs: additional dictionary of keyword arguments that will be passed to the
LLM class of vllm library. "},{"location":"components-gallery/llms/vllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/vllm/#generate-text","title":"Generate text","text":"from distilabel.models.llms import vLLM\n\n# You can pass a custom chat_template to the model\nllm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n "},{"location":"components-gallery/llms/vllm/#generate-structured-data","title":"Generate structured data","text":"from pydantic import BaseModel\nfrom distilabel.models.llms import vLLM\n\nclass User(BaseModel):\n name: str\n last_name: str\n id: int\n\nllm = vLLM(\n model=\"prometheus-eval/prometheus-7b-v2.0\",\n structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n "},{"location":"components-gallery/embeddings/","title":"Embeddings Gallery","text":" -
SentenceTransformerEmbeddings sentence-transformers library implementation for embedding generation. SentenceTransformerEmbeddings -
vLLMEmbeddings vllm library implementation for embedding generation. vLLMEmbeddings "},{"location":"components-gallery/embeddings/sentencetransformerembeddings/","title":"SentenceTransformerEmbeddings","text":"sentence-transformers library implementation for embedding generation. "},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#attributes","title":"Attributes","text":" -
model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. -
device: the name of the device used to load the model e.g. \"cuda\", \"mps\", etc. Defaults to None . -
prompts: a dictionary containing prompts to be used with the model. Defaults to None . -
default_prompt_name: the default prompt (in prompts ) that will be applied to the inputs. If not provided, then no prompt will be used. Defaults to None . -
trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False . -
revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\" . -
token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None . -
truncate_dim: the dimension to truncate the sentence embeddings. Defaults to None . -
model_kwargs: extra kwargs that will be passed to the Hugging Face transformers model class. Defaults to None . -
tokenizer_kwargs: extra kwargs that will be passed to the Hugging Face transformers tokenizer class. Defaults to None . -
config_kwargs: extra kwargs that will be passed to the Hugging Face transformers configuration class. Defaults to None . -
precision: the dtype that the resulting embeddings will have. Defaults to \"float32\" . -
normalize_embeddings: whether to normalize the embeddings so they have a length of 1. Defaults to None . "},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#generating-sentence-embeddings","title":"Generating sentence embeddings","text":"from distilabel.models import SentenceTransformerEmbeddings\n\nembeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n# [-0.05447685346007347, -0.01623094454407692, ...],\n# [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n "},{"location":"components-gallery/embeddings/vllmembeddings/","title":"vLLMEmbeddings","text":"vllm library implementation for embedding generation. "},{"location":"components-gallery/embeddings/vllmembeddings/#attributes","title":"Attributes","text":" -
model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files. -
dtype: the data type to use for the model. Defaults to auto . -
trust_remote_code: whether to trust the remote code when loading the model. Defaults to False . -
quantization: the quantization mode to use for the model. Defaults to None . -
revision: the revision of the model to load. Defaults to None . -
enforce_eager: whether to enforce eager execution. Defaults to True . -
seed: the seed to use for the random number generator. Defaults to 0 . -
extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {} . -
_model: the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method. "},{"location":"components-gallery/embeddings/vllmembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/embeddings/vllmembeddings/#generating-sentence-embeddings","title":"Generating sentence embeddings","text":"from distilabel.models import vLLMEmbeddings\n\nembeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n# [-0.05447685346007347, -0.01623094454407692, ...],\n# [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n "},{"location":"components-gallery/embeddings/vllmembeddings/#references","title":"References","text":" - Offline inference embeddings
"}]}
\ No newline at end of file
|